Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
node_modules
.env
condense/__pycache__
condense.egg-info/
client/coverage
server/coverage
server/coverage
164 changes: 0 additions & 164 deletions audio-to-text-transcription/youtube_audio_to_text.py

This file was deleted.

1 change: 1 addition & 0 deletions condense/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Copyright (C) 2024 Condense, Inc. All Rights Reserved.
82 changes: 82 additions & 0 deletions condense/transcript.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Copyright (C) 2024 Condense, Inc. All Rights Reserved.
import re
import sys
import logging
import argparse

import youtube_transcript_api

from condense.utils import save_to_file
from condense.youtube_audio_extractor import get_transcript_from_video

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


def make_parser() -> argparse.ArgumentParser:
    """
    Build the command-line argument parser for the transcript tool.

    Defines one required ``-u/--url`` option plus a "rendering arguments"
    group of mutually independent output-format flags.
    """
    parser = argparse.ArgumentParser(
        description="Get the transcript for a video",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "-u",
        "--url",
        dest="video_url",
        type=str,
        required=True,
        help="The URL of the video to get the transcript for",
    )

    # Register the three store_true output-format flags from a table so
    # they are guaranteed to be configured identically.
    rendering = parser.add_argument_group("rendering arguments")
    format_flags = (
        ("-j", "--json", "emit JSON instead of text"),
        ("-t", "--text", "emit text instead of JSON"),
        ("-c", "--csv", "emit CSV instead of JSON"),
    )
    for short_flag, long_flag, help_text in format_flags:
        rendering.add_argument(short_flag, long_flag, action="store_true", help=help_text)

    return parser


def get_transcript(argv: argparse.Namespace) -> list[dict[str, str]]:
    """
    Fetch the transcript for the YouTube video named in ``argv.video_url``.

    Tries the YouTube transcript API first; if transcripts are disabled for
    the video, falls back to extracting text from the audio track.

    Args:
        argv: Parsed CLI namespace; ``video_url`` must be a YouTube URL
            (``youtube.com/watch?v=...`` or ``youtu.be/...``).

    Returns:
        A list of caption dicts with "start", "end", and "text" keys.

    Raises:
        ValueError: If the URL is not a recognizable YouTube URL.
    """
    video_url = argv.video_url
    # Initialize so unrecognized hosts fall through to ValueError below
    # (previously this raised NameError for non-YouTube URLs).
    video_id_match = None
    if "youtube.com/watch?v=" in video_url:
        video_id_match = re.search(r"(?:https?://)?(?:www\.)?youtube\.com/watch\?v=(?P<url>[^&]+)", video_url)
    elif "youtu.be" in video_url:
        video_id_match = re.search(r"(?:https?://)?(?:www\.)?youtu\.be/(?P<url>[^&]+)", video_url)

    if video_id_match:
        video_id = video_id_match.group("url")
    else:
        raise ValueError("Invalid YouTube URL.")

    transcript = None
    try:
        transcript = youtube_transcript_api.YouTubeTranscriptApi.get_transcript(video_id)
    except youtube_transcript_api._errors.TranscriptsDisabled:
        logger.info("Transcripts are disabled for this video, using audio to text instead.")

    if transcript:
        # Normalize API segments: the API reports a duration, callers
        # expect an absolute end timestamp.
        return [
            {
                "start": segment["start"],
                "end": segment["start"] + segment["duration"],
                "text": segment["text"],
            }
            for segment in transcript
        ]
    # Fallback path: transcribe the audio track directly.
    captions, _ = get_transcript_from_video(argv)
    return captions


def main(argv: list[str] | None = None) -> int:
    """
    CLI entry point: parse arguments, fetch the transcript, save it to disk.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit code (0 on success).
    """
    parser = make_parser()
    args = parser.parse_args(argv)
    logging.basicConfig(level=logging.DEBUG)

    transcript = get_transcript(args)
    save_to_file(transcript, args)
    # Previously fell off the end and returned None, contradicting the
    # declared -> int and making sys.exit() rely on None == success.
    return 0


if __name__ == "__main__":
    sys.exit(main())
77 changes: 77 additions & 0 deletions condense/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import os
import csv
import json
import logging
import argparse

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


def save_to_text(data: list[dict[str, str]], text_filename: str) -> None:
    """
    Write captions to *text_filename*, one "start --> end text" line each.

    Args:
        data: Caption dicts with "start", "end", and "text" keys.
        text_filename: Destination path; overwritten if it already exists.
    """
    # Return annotation fixed: the literal `True` is not a type and the
    # function returns nothing.
    with open(text_filename, mode="w", encoding="utf-8") as text_file:
        for caption in data:
            # One caption per line: "<start> --> <end> <text>"
            text_file.write(f"{caption['start']} --> {caption['end']} ")
            text_file.write(f"{caption['text']}\n")


def save_to_csv(data: list[dict[str, str]], csv_filename: str) -> None:
    """
    Write captions to *csv_filename* with a start/end/text header row.

    Args:
        data: Caption dicts with "start", "end", and "text" keys.
        csv_filename: Destination path; overwritten if it already exists.
    """
    fieldnames = ["start", "end", "text"]
    # newline="" is required by the csv module; without it row
    # terminators are doubled on Windows.
    with open(csv_filename, mode="w", encoding="utf-8", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            # Project onto the expected columns so an extra key in a
            # caption dict can never raise ValueError from DictWriter.
            writer.writerow({name: row[name] for name in fieldnames})


def _unique_path(directory: str, stem: str, suffix: str) -> str:
    """Return a path under *directory* for ``stem+suffix`` that does not
    collide with an existing file, appending 1, 2, ... as needed."""
    candidate = os.path.join(directory, f"{stem}{suffix}")
    count = 0
    while os.path.exists(candidate):
        count += 1
        candidate = os.path.join(directory, f"{stem}{count}{suffix}")
    return candidate


def save_to_file(data: list[dict[str, str]], args: argparse.Namespace) -> None:
    """
    Save transcript *data* under ./transcripts in the formats selected on *args*.

    Args:
        data: Caption dicts with "start", "end", and "text" keys.
        args: Parsed CLI namespace with boolean ``json``, ``text`` and
            ``csv`` flags; each selected format is written to its own file.
    """
    # Return annotation fixed: the literal `True` is not a type and the
    # function returns nothing. The triplicated collision-avoidance loop
    # now lives in _unique_path.
    output_path = "transcripts"
    os.makedirs(output_path, exist_ok=True)

    if args.json:
        json_path = _unique_path(output_path, "transcript_data", ".json")
        with open(json_path, "w", encoding="utf-8") as json_file:
            json.dump(data, json_file, indent=4)
        logger.info("Transcript data saved to %s", os.path.basename(json_path))

    if args.text:
        text_path = _unique_path(output_path, "transcript_text", ".txt")
        save_to_text(data, text_path)
        logger.info("Transcript text saved to %s", os.path.basename(text_path))

    if args.csv:
        csv_path = _unique_path(output_path, "transcript_data", ".csv")
        save_to_csv(data, csv_path)
        logger.info("Transcript data saved to %s", os.path.basename(csv_path))
Loading