1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
import re
from requests import Session
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import (
CouldNotRetrieveTranscript,
IpBlocked,
TranscriptsDisabled,
VideoUnavailable,
)
class IPBlockedError(Exception):
pass
class TranscriptNotAvailableError(Exception):
pass
class TranscriptDisabledError(Exception):
pass
class InvalidURLError(Exception):
pass
_YOUTUBE_URL_PATTERNS = [
re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/watch\?v=([A-Za-z0-9_-]{11})"),
re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/embed/([A-Za-z0-9_-]{11})"),
re.compile(r"(?:https?://)?youtu\.be/([A-Za-z0-9_-]{11})"),
re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/shorts/([A-Za-z0-9_-]{11})"),
]
def extract_video_id(url: str) -> str:
for pattern in _YOUTUBE_URL_PATTERNS:
match = pattern.search(url)
if match:
return match.group(1)
raise InvalidURLError(f"Could not extract YouTube video ID from URL: {url}")
def create_api() -> YouTubeTranscriptApi:
"""Create a YouTubeTranscriptApi with a hardened browser-like Session."""
session = Session()
session.headers.update(
{
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/124.0.0.0 Safari/537.36"
),
"Accept-Language": "en-US,en;q=0.5",
}
)
return YouTubeTranscriptApi(http_client=session)
def fetch_transcript_by_id(video_id: str, api: YouTubeTranscriptApi) -> list[dict]:
"""Fetch a transcript by video_id using a pre-built API instance."""
try:
transcript = api.fetch(video_id)
except IpBlocked as e:
raise IPBlockedError(f"YouTube is blocking requests from this IP. {e}")
except VideoUnavailable:
raise TranscriptNotAvailableError(f"Video {video_id} is unavailable")
except TranscriptsDisabled:
raise TranscriptDisabledError(
f"Transcripts are disabled for video {video_id}"
)
except CouldNotRetrieveTranscript:
raise TranscriptNotAvailableError(
f"No transcript available for video {video_id}"
)
return transcript.to_raw_data()
def fetch_transcript(url: str) -> tuple[str, list[dict]]:
video_id = extract_video_id(url)
api = create_api()
segments = fetch_transcript_by_id(video_id, api)
return video_id, segments
|