Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix github api pagination #247

Merged
merged 1 commit into from
Feb 8, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 83 additions & 19 deletions app/modules/code_provider/github/github_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional, Tuple

import aiohttp
import chardet
import requests
from fastapi import HTTPException
Expand Down Expand Up @@ -187,23 +188,71 @@ async def get_repos_for_user(self, user_id: str):

auth = AppAuth(app_id=app_id, private_key=private_key)
jwt = auth.create_jwt()
installations_url = "https://api.github.com/app/installations"

# Fetch all installations with pagination
all_installations = []
base_url = "https://api.github.com/app/installations"
headers = {
"Accept": "application/vnd.github+json",
"Authorization": f"Bearer {jwt}",
"X-GitHub-Api-Version": "2022-11-28",
}

response = requests.get(installations_url, headers=headers)

if response.status_code != 200:
logger.error(f"Failed to get installations. Response: {response.text}")
raise HTTPException(
status_code=response.status_code,
detail=f"Failed to get installations: {response.text}",
)

all_installations = response.json()
async with aiohttp.ClientSession() as session:
# Get first page to determine total pages
async with session.get(base_url, headers=headers) as response:
if response.status != 200:
error_text = await response.text()
logger.error(f"Failed to get installations. Response: {error_text}")
raise HTTPException(
status_code=response.status,
detail=f"Failed to get installations: {error_text}",
)

# Extract last page number from Link header
last_page = 1
if "Link" in response.headers:
link_header = response.headers["Link"]
links = requests.utils.parse_header_links(link_header)
for link in links:
if link["rel"] == "last":
last_page = int(link["url"].split("page=")[-1])
break

# Add first page's data
first_page_data = await response.json()
all_installations.extend(first_page_data)

logger.info(f"Total pages to fetch: {last_page}")

# Generate remaining page URLs (skip page 1 since we already have it)
page_urls = [f"{base_url}?page={page}" for page in range(2, last_page + 1)]

# Process URLs in batches of 10
async def fetch_page(url):
try:
async with session.get(url, headers=headers) as response:
if response.status == 200:
installations = await response.json()
logger.info(f"Fetched {len(installations)} installations from {url}")
return installations
else:
error_text = await response.text()
logger.error(f"Failed to fetch page {url}. Response: {error_text}")
return []
except Exception as e:
logger.error(f"Error fetching page {url}: {str(e)}")
return []

# Process URLs in batches of 10
for i in range(0, len(page_urls), 10):
batch = page_urls[i:i + 10]
batch_tasks = [fetch_page(url) for url in batch]
batch_results = await asyncio.gather(*batch_tasks)
for installations in batch_results:
all_installations.extend(installations)

logger.info(f"Final number of installations collected: {len(all_installations)}")

# Filter installations: user's personal installation + org installations where user is a member
user_installations = []
Expand All @@ -217,20 +266,35 @@ async def get_repos_for_user(self, user_id: str):
elif account_type == "Organization" and account_login in org_logins:
user_installations.append(installation)


repos = []
for installation in user_installations:
app_auth = auth.get_installation_auth(installation["id"])
github = Github(auth=app_auth)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Remove unused variable.
'github' is assigned to but never used, which might confuse readers and violate lint checks.

- github = Github(auth=app_auth)
  repos_url = installation["repositories_url"]
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
github = Github(auth=app_auth)
repos_url = installation["repositories_url"]
🧰 Tools
🪛 Ruff (0.8.2)

273-273: Local variable github is assigned to but never used

Remove assignment to unused variable github

(F841)

repos_url = installation["repositories_url"]
repos_response = requests.get(
repos_url, headers={"Authorization": f"Bearer {app_auth.token}"}
)
if repos_response.status_code == 200:
repos.extend(repos_response.json().get("repositories", []))
else:
logger.error(
f"Failed to fetch repositories for installation ID {installation['id']}. Response: {repos_response.text}"

# Handle pagination for repositories as well
next_repos_url = repos_url
while next_repos_url:
repos_response = requests.get(
next_repos_url,
headers={"Authorization": f"Bearer {app_auth.token}"}
)

if repos_response.status_code == 200:
repos.extend(repos_response.json().get("repositories", []))

# Check for next page in Link header
next_repos_url = None
if "links" in repos_response.__dict__:
for link in repos_response.links.values():
if link.get("rel") == "next":
next_repos_url = link.get("url")
break
else:
Comment on lines +275 to +294
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Consider using aiohttp for repository pagination to maintain consistency.
You’re performing synchronous GET requests inside an async method, which can block the event loop. Switch to the existing aiohttp session (or open another context) for fetching paginated repositories to leverage concurrency benefits.

Example approach:

- repos_response = requests.get(
-     next_repos_url,
-     headers={"Authorization": f"Bearer {app_auth.token}"}
- )
+ async with session.get(
+     next_repos_url,
+     headers={"Authorization": f"Bearer {app_auth.token}"}
+ ) as response:
+     if response.status == 200:
+         data = await response.json()
+         repos.extend(data.get("repositories", []))
+         # Continue processing

Committable suggestion skipped: line range outside the PR's diff.

logger.error(
f"Failed to fetch repositories for installation ID {installation['id']}. Response: {repos_response.text}"
)

# Remove duplicate repositories if any
unique_repos = {repo["id"]: repo for repo in repos}.values()
Expand Down
Loading