Skip to content

Commit

Permalink
community[minor]: [GoogleApiYoutubeLoader] Replace API used in _get_d…
Browse files Browse the repository at this point in the history
…ocument_for_channel from search to playlistItem (#24034)

- **Description:** The `search` endpoint is limited to 500 results, whereas
`playlistItems` is not. Also added `ParseError` to the `except` clause to catch another common error.
- **Issue:** None
- **Dependencies:** None
- **Twitter handle:** @TupleType

---------

Co-authored-by: asi-cider <88270351+asi-cider@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
  • Loading branch information
3 people authored Jul 19, 2024
1 parent 6a45bf9 commit 372c27f
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 11 deletions.
29 changes: 19 additions & 10 deletions libs/community/langchain_community/document_loaders/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional, Sequence, Union
from urllib.parse import parse_qs, urlparse
from xml.etree.ElementTree import ParseError # OK: trusted-source

from langchain_core.documents import Document
from langchain_core.pydantic_v1 import root_validator
Expand All @@ -28,6 +29,8 @@ class GoogleApiClient:
As the google api expects credentials you need to set up a google account and
register your Service. "https://developers.google.com/docs/api/quickstart/python"
*Security Note*: Note that parsing of the transcripts relies on the standard
xml library but the input is viewed as trusted in this case.
Example:
Expand Down Expand Up @@ -437,6 +440,14 @@ def _get_channel_id(self, channel_name: str) -> str:
channel_id = response["items"][0]["id"]["channelId"]
return channel_id

def _get_uploads_playlist_id(self, channel_id: str) -> str:
    """Return the ID of the "uploads" playlist for a YouTube channel.

    Issues a ``channels().list`` request for the ``contentDetails`` part of
    ``channel_id`` and reads ``relatedPlaylists.uploads`` from the first
    item in the response.

    Args:
        channel_id: ID of the channel to look up.

    Returns:
        The uploads playlist ID reported for the channel.

    Raises:
        ValueError: If the response contains no items for ``channel_id``.
    """
    request = self.youtube_client.channels().list(
        part="contentDetails",
        id=channel_id,
    )
    response = request.execute()
    # A lookup that matches nothing may come back with a missing or empty
    # "items" list; fail with a descriptive error rather than a bare
    # KeyError/IndexError from the indexing below.
    items = response.get("items")
    if not items:
        raise ValueError(f"No channel found for channel id: {channel_id!r}")
    return items[0]["contentDetails"]["relatedPlaylists"]["uploads"]

def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
try:
from youtube_transcript_api import (
Expand All @@ -452,34 +463,32 @@ def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Documen
)

channel_id = self._get_channel_id(channel)
request = self.youtube_client.search().list(
uploads_playlist_id = self._get_uploads_playlist_id(channel_id)
request = self.youtube_client.playlistItems().list(
part="id,snippet",
channelId=channel_id,
maxResults=50, # adjust this value to retrieve more or fewer videos
playlistId=uploads_playlist_id,
maxResults=50,
)
video_ids = []
while request is not None:
response = request.execute()

# Add each video ID to the list
for item in response["items"]:
if not item["id"].get("videoId"):
continue
meta_data = {"videoId": item["id"]["videoId"]}
video_id = item["snippet"]["resourceId"]["videoId"]
meta_data = {"videoId": video_id}
if self.add_video_info:
item["snippet"].pop("thumbnails")
meta_data.update(item["snippet"])
try:
page_content = self._get_transcripe_for_video_id(
item["id"]["videoId"]
)
page_content = self._get_transcripe_for_video_id(video_id)
video_ids.append(
Document(
page_content=page_content,
metadata=meta_data,
)
)
except (TranscriptsDisabled, NoTranscriptFound) as e:
except (TranscriptsDisabled, NoTranscriptFound, ParseError) as e:
if self.continue_on_failure:
logger.error(
"Error fetching transscript "
Expand Down
2 changes: 1 addition & 1 deletion libs/community/scripts/lint_imports.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ fi
# is very nuanced and depends on the user's environment.
# https://docs.python.org/3/library/xml.etree.elementtree.html

result=$(git -C "$repository_path" grep -En '^from xml.|^import xml$|^import xml.' | grep -vE "# OK: user-must-opt-in" || true)
result=$(git -C "$repository_path" grep -En '^from xml.|^import xml$|^import xml.' | grep -vE "# OK: user-must-opt-in| # OK: trusted-source" || true)

if [ -n "$result" ]; then
echo "ERROR: The following lines need to be updated:"
Expand Down

0 comments on commit 372c27f

Please sign in to comment.