Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
node_modules
.env
condense/__pycache__
condense.egg-info/
client/coverage
server/coverage
server/coverage
164 changes: 0 additions & 164 deletions audio-to-text-transcription/youtube_audio_to_text.py

This file was deleted.

1 change: 1 addition & 0 deletions condense/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Copyright (C) 2024 Condense, Inc. All Rights Reserved.
82 changes: 82 additions & 0 deletions condense/transcript.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Copyright (C) 2024 Condense, Inc. All Rights Reserved.
import re
import sys
import logging
import argparse

import youtube_transcript_api

from condense.utils import save_to_file
from condense.youtube_audio_extractor import get_transcript_from_video

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


def make_parser() -> argparse.ArgumentParser:
    """
    Build the command-line argument parser for the transcript tool.

    Defines one required ``-u/--url`` option plus a "rendering arguments"
    group of mutually independent output-format flags.
    """
    parser = argparse.ArgumentParser(
        description="Get the transcript for a video",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "-u",
        "--url",
        dest="video_url",
        type=str,
        required=True,
        help="The URL of the video to get the transcript for",
    )

    # Register the three store_true output-format flags from a table so
    # they are guaranteed to be configured identically.
    rendering = parser.add_argument_group("rendering arguments")
    format_flags = (
        ("-j", "--json", "emit JSON instead of text"),
        ("-t", "--text", "emit text instead of JSON"),
        ("-c", "--csv", "emit CSV instead of JSON"),
    )
    for short_flag, long_flag, help_text in format_flags:
        rendering.add_argument(short_flag, long_flag, action="store_true", help=help_text)

    return parser


def get_transcript(argv: argparse.Namespace) -> list[dict[str, str]]:
    """
    Fetch the transcript for the YouTube video named in ``argv.video_url``.

    Tries the YouTube transcript API first; if transcripts are disabled for
    the video, falls back to extracting text from the audio track.

    Args:
        argv: Parsed CLI namespace; ``video_url`` must be a YouTube URL
            (``youtube.com/watch?v=...`` or ``youtu.be/...``).

    Returns:
        A list of caption dicts with "start", "end", and "text" keys.

    Raises:
        ValueError: If the URL is not a recognizable YouTube URL.
    """
    video_url = argv.video_url
    # Initialize so unrecognized hosts fall through to ValueError below
    # (previously this raised NameError for non-YouTube URLs).
    video_id_match = None
    if "youtube.com/watch?v=" in video_url:
        video_id_match = re.search(r"(?:https?://)?(?:www\.)?youtube\.com/watch\?v=(?P<url>[^&]+)", video_url)
    elif "youtu.be" in video_url:
        video_id_match = re.search(r"(?:https?://)?(?:www\.)?youtu\.be/(?P<url>[^&]+)", video_url)

    if video_id_match:
        video_id = video_id_match.group("url")
    else:
        raise ValueError("Invalid YouTube URL.")

    transcript = None
    try:
        transcript = youtube_transcript_api.YouTubeTranscriptApi.get_transcript(video_id)
    except youtube_transcript_api._errors.TranscriptsDisabled:
        logger.info("Transcripts are disabled for this video, using audio to text instead.")

    if transcript:
        # Normalize API segments: the API reports a duration, callers
        # expect an absolute end timestamp.
        return [
            {
                "start": segment["start"],
                "end": segment["start"] + segment["duration"],
                "text": segment["text"],
            }
            for segment in transcript
        ]
    # Fallback path: transcribe the audio track directly.
    captions, _ = get_transcript_from_video(argv)
    return captions


def main(argv: list[str] | None = None) -> int:
    """
    CLI entry point: parse arguments, fetch the transcript, save it to disk.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit code (0 on success).
    """
    parser = make_parser()
    args = parser.parse_args(argv)
    logging.basicConfig(level=logging.DEBUG)

    transcript = get_transcript(args)
    save_to_file(transcript, args)
    # Previously fell off the end and returned None, contradicting the
    # declared -> int and making sys.exit() rely on None == success.
    return 0


if __name__ == "__main__":
    sys.exit(main())
77 changes: 77 additions & 0 deletions condense/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import os
import csv
import json
import logging
import argparse

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


def save_to_text(data: list[dict[str, str]], text_filename: str) -> None:
    """
    Write captions to *text_filename*, one "start --> end text" line each.

    Args:
        data: Caption dicts with "start", "end", and "text" keys.
        text_filename: Destination path; overwritten if it already exists.
    """
    # Return annotation fixed: the literal `True` is not a type and the
    # function returns nothing.
    with open(text_filename, mode="w", encoding="utf-8") as text_file:
        for caption in data:
            # One caption per line: "<start> --> <end> <text>"
            text_file.write(f"{caption['start']} --> {caption['end']} ")
            text_file.write(f"{caption['text']}\n")


def save_to_csv(data: list[dict[str, str]], csv_filename: str) -> None:
    """
    Write captions to *csv_filename* with a start/end/text header row.

    Args:
        data: Caption dicts with "start", "end", and "text" keys.
        csv_filename: Destination path; overwritten if it already exists.
    """
    fieldnames = ["start", "end", "text"]
    # newline="" is required by the csv module; without it row
    # terminators are doubled on Windows.
    with open(csv_filename, mode="w", encoding="utf-8", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            # Project onto the expected columns so an extra key in a
            # caption dict can never raise ValueError from DictWriter.
            writer.writerow({name: row[name] for name in fieldnames})


def _unique_path(directory: str, stem: str, suffix: str) -> str:
    """Return a path under *directory* for ``stem+suffix`` that does not
    collide with an existing file, appending 1, 2, ... as needed."""
    candidate = os.path.join(directory, f"{stem}{suffix}")
    count = 0
    while os.path.exists(candidate):
        count += 1
        candidate = os.path.join(directory, f"{stem}{count}{suffix}")
    return candidate


def save_to_file(data: list[dict[str, str]], args: argparse.Namespace) -> None:
    """
    Save transcript *data* under ./transcripts in the formats selected on *args*.

    Args:
        data: Caption dicts with "start", "end", and "text" keys.
        args: Parsed CLI namespace with boolean ``json``, ``text`` and
            ``csv`` flags; each selected format is written to its own file.
    """
    # Return annotation fixed: the literal `True` is not a type and the
    # function returns nothing. The triplicated collision-avoidance loop
    # now lives in _unique_path.
    output_path = "transcripts"
    os.makedirs(output_path, exist_ok=True)

    if args.json:
        json_path = _unique_path(output_path, "transcript_data", ".json")
        with open(json_path, "w", encoding="utf-8") as json_file:
            json.dump(data, json_file, indent=4)
        logger.info("Transcript data saved to %s", os.path.basename(json_path))

    if args.text:
        text_path = _unique_path(output_path, "transcript_text", ".txt")
        save_to_text(data, text_path)
        logger.info("Transcript text saved to %s", os.path.basename(text_path))

    if args.csv:
        csv_path = _unique_path(output_path, "transcript_data", ".csv")
        save_to_csv(data, csv_path)
        logger.info("Transcript data saved to %s", os.path.basename(csv_path))
Loading