diff --git a/tool/compare_commits.py b/tool/compare_commits.py
index bd262f4d..de1d3ff6 100644
--- a/tool/compare_commits.py
+++ b/tool/compare_commits.py
@@ -50,13 +50,30 @@ def tag_format(tag, package_name, repo_name):
     return tag_formats
 
 
-def find_existing_tags(tag_formats, repo_name):
+def find_existing_tags_batch(tag_formats, repo_name):
+    # Get all tags in one request; MAY FAIL if the repo has too many tags
+    tags_url = f"https://api.github.com/repos/{repo_name}/git/refs/tags"
+    response = make_github_request(tags_url)
+
+    if not response:
+        return []
+    elif response == 504:
+        for tag_format in tag_formats:
+            response = make_github_request(f"{tags_url}/{tag_format}")
+            if response:
+                return [tag_format]
+        return []
+
+    # Create a map of all tags
+    all_tags = {ref["ref"].replace("refs/tags/", ""): ref for ref in response}
+
+    # Find the matching tag formats
+    matching_tags = []
     for tag_format in tag_formats:
-        tag_url = f"https://api.github.com/repos/{repo_name}/git/ref/tags/{tag_format}"
-        response = make_github_request(tag_url, silent=True)
-        if response:
-            return tag_format
-    return None
+        if tag_format in all_tags:
+            matching_tags.append(tag_format)
+
+    return matching_tags if matching_tags else []
 
 
 def get_commit_info(commit):
@@ -136,47 +153,33 @@ def get_authors_from_response(url, data, package_info):
 def get_authors_from_tags(tag1, tag2, package, package_info):
     repo_name = package_info.get("repo_name")
     tag_formats_old = tag_format(tag1, package, repo_name)
-    existing_tag_format_old = find_existing_tags(tag_formats_old, repo_name)
+    existing_tag_format_old = find_existing_tags_batch(tag_formats_old, repo_name)
+    logging.info(f"Existing tag format old: {existing_tag_format_old}")
 
     tag_formats_new = tag_format(tag2, package, repo_name)
-    existing_tag_format_new = find_existing_tags(tag_formats_new, repo_name)
-    category = package_info.get("message")
-
-    compare_url = (
-        f"https://api.github.com/repos/{repo_name}/compare/{existing_tag_format_old}...{existing_tag_format_new}"
-    )
-    response = (
-        make_github_request(compare_url, max_retries=2)
-        if existing_tag_format_old and existing_tag_format_new
-        else None
-    )
+    existing_tag_format_new = find_existing_tags_batch(tag_formats_new, repo_name)
+    logging.info(f"Existing tag format new: {existing_tag_format_new}")
 
-    if not response:
+    if not existing_tag_format_old:
         status_old = "GitHub old tag not found"
+    if not existing_tag_format_new:
         status_new = "GitHub new tag not found"
-        old_tag_found, new_tag_found = False, False
-        if existing_tag_format_old:
-            status_old = existing_tag_format_old
-            old_tag_found = True
-        for tag_old in tag_formats_old:
-            old_tag_url = f"https://api.github.com/repos/{repo_name}/git/ref/tags/{tag_old}"
-            response = requests.get(old_tag_url)
-            if response.status_code == 200:
-                status_old = tag_old
-                old_tag_found = True
-                break
-
-        if not old_tag_found:
-            for tag_new in tag_formats_new:
-                new_tag_url = f"https://api.github.com/repos/{repo_name}/git/ref/tags/{tag_new}"
-                response = requests.get(new_tag_url)
-                if response.status_code == 200:
-                    status_new = tag_new
-                    new_tag_found = True
-                    break
+
+    response = None
+    for old_tag, new_tag in zip(existing_tag_format_old, existing_tag_format_new):
+        logging.info(f"Old tag: {old_tag}, New tag: {new_tag}")
+        compare_url = f"https://api.github.com/repos/{repo_name}/compare/{old_tag}...{new_tag}"
+        response = make_github_request(compare_url, max_retries=2)
+        if response:
+            logging.info(f"Found response for {old_tag}...{new_tag}")
+            break
+
+    if not response:
+        status_old = "GitHub old tag not found" if not existing_tag_format_old else existing_tag_format_old[0]
+        status_new = "GitHub new tag not found" if not existing_tag_format_new else existing_tag_format_new[0]
         return {
-            "tag1": existing_tag_format_old if existing_tag_format_old else list(tag_formats_old)[-1],
-            "tag2": existing_tag_format_new if existing_tag_format_new else list(tag_formats_new)[-1],
+            "tag1": existing_tag_format_old[0] if existing_tag_format_old else list(tag_formats_old)[0],
+            "tag2": existing_tag_format_new[0] if existing_tag_format_new else list(tag_formats_new)[0],
             "status_old": status_old,
             "status_new": status_new,
             "category": "Upgraded package",
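Note: the batched lookup above trades N `git/ref/tags/<format>` requests for one listing of `git/refs/tags`, followed by a local membership test. A minimal standalone sketch of the same pattern, assuming a plain `requests` call rather than the repo's `make_github_request` helper; the repo name and tag formats are placeholders:

```python
import requests


def find_matching_tags(candidate_formats, repo_name, token=None):
    """List every tag ref once, then membership-test the candidate formats locally."""
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    resp = requests.get(
        f"https://api.github.com/repos/{repo_name}/git/refs/tags",
        headers=headers,
        timeout=30,
    )
    if resp.status_code != 200:
        return []  # a caller could fall back to per-tag lookups here, as the diff does on 504
    # Note: this listing is paginated, so very large repos may still need per_page/page handling.
    existing = {ref["ref"].replace("refs/tags/", "") for ref in resp.json()}
    return [fmt for fmt in candidate_formats if fmt in existing]


# e.g. find_matching_tags(["v1.2.3", "1.2.3", "mypkg-1.2.3"], "octocat/Hello-World")
```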
diff --git a/tool/get_pr_info.py b/tool/get_pr_info.py
index f0b58ebd..ee4dc087 100644
--- a/tool/get_pr_info.py
+++ b/tool/get_pr_info.py
@@ -1,15 +1,12 @@
-import requests
-import sqlite3
 import os
-import json
-import time
-import copy
 import logging
+from typing import List, Dict, Tuple
 
 from tool.tool_config import get_cache_manager, make_github_request
 
 cache_manager = get_cache_manager()
 GITHUB_TOKEN = os.getenv("GITHUB_API_TOKEN")
+BATCH_SIZE = 100  # Configurable batch size
 
 headers = {
     "Authorization": f"Bearer {GITHUB_TOKEN}",
@@ -19,123 +16,233 @@
 url = "https://api.github.com/graphql"
 
 
-def fetch_pull_requests(commit_node_id):
-    query = """
-    query Edges($nodeId: ID!, $first: Int) {
-        node(id: $nodeId) {
-            ... on Commit {
-                associatedPullRequests(first: $first) {
-                    edges {
-                        node {
-                            author {
-                                login
-                                __typename
-                            }
-                            authorAssociation
-                            autoMergeRequest {
-                                mergeMethod
-                                enabledBy {
-                                    login
-                                }
-                                authorEmail
-                            }
-                            checksUrl
-                            createdAt
-                            mergeCommit {
-                                author {
-                                    name
-                                    email
-                                }
-                            }
-                            id
-                            merged
-                            mergedAt
-                            mergedBy {
-                                login
-                                __typename
-                            }
-                            number
-                            state
-                            url
-                            reviews(first: $first, states: APPROVED) {
-                                edges {
-                                    node {
-                                        author {
-                                            login
-                                            __typename
-                                        }
-                                        id
-                                        state
-                                        createdAt
-                                        publishedAt
-                                        submittedAt
-                                        updatedAt
-                                    }
+def fetch_and_cache_batch(commit_batch: List[Tuple[str, str, str, str]]) -> List[Dict]:
+    """
+    Fetch and cache PR information for a batch of commits.
+
+    Args:
+        commit_batch: List of tuples (node_id, commit_sha, package, repo_name)
+
+    Returns:
+        List of processed PR information dictionaries
+    """
+    if not commit_batch:
+        return []
+
+    # Build the GraphQL query for this batch
+    query_parts = []
+    variables = {"first": 5}  # Number of PRs to fetch per commit
+
+    for i, (node_id, _, _, _) in enumerate(commit_batch):
+        variables[f"nodeId{i}"] = node_id
+        query_parts.append(
+            f"""
+            node{i}: node(id: $nodeId{i}) {{
+                ... on Commit {{
+                    associatedPullRequests(first: $first) {{
+                        edges {{
+                            node {{
+                                author {{
+                                    login
+                                    __typename
+                                }}
+                                authorAssociation
+                                autoMergeRequest {{
+                                    mergeMethod
+                                    enabledBy {{
+                                        login
+                                    }}
+                                    authorEmail
+                                }}
+                                checksUrl
+                                createdAt
+                                mergeCommit {{
+                                    author {{
+                                        name
+                                        email
+                                    }}
+                                }}
+                                id
+                                merged
+                                mergedAt
+                                mergedBy {{
+                                    login
+                                    __typename
+                                }}
+                                number
+                                state
+                                url
+                                reviews(first: $first, states: APPROVED) {{
+                                    edges {{
+                                        node {{
+                                            author {{
+                                                login
+                                                __typename
+                                            }}
+                                            id
+                                            state
+                                            createdAt
+                                            publishedAt
+                                            submittedAt
+                                            updatedAt
+                                        }}
+                                    }}
+                                }}
+                                repository {{
+                                    name
+                                    owner {{
+                                        login
+                                        id
+                                    }}
+                                }}
+                            }}
+                        }}
+                    }}
+                }}
+            }}
+            """
+        )
+
+    # Construct the full query
+    query = (
+        "query("
+        + ", ".join([f"$nodeId{i}: ID!" for i in range(len(commit_batch))])
+        + ", $first: Int) {"
+        + "\n".join(query_parts)
+        + "}"
+    )
+
+    # Execute the query
+    body = {"query": query, "variables": variables}
+    response = make_github_request(url, method="POST", json_data=body, headers=headers, max_retries=3)
+
+    batch_results = []
+    if response and "data" in response:
+        # Process and cache each result immediately
+        for i, (node_id, commit_sha, package, repo_name) in enumerate(commit_batch):
+            node_key = f"node{i}"
+            pr_info = {}
+            if (
+                node_key in response["data"]
+                and response["data"][node_key] is not None
+                and "associatedPullRequests" in response["data"][node_key]
+            ):
+                pr_info = {"data": {"node": response["data"][node_key]}}
+
+            # Cache immediately after processing each item
+            cache_manager.github_cache.cache_pr_info(
+                {
+                    "package": package,
+                    "commit_sha": commit_sha,
+                    "commit_node_id": node_id,
+                    "pr_info": pr_info,
                 }
+            )
+
+            # Add to batch results
+            batch_results.append(
+                {
+                    "package": package,
+                    "commit_sha": commit_sha,
+                    "commit_node_id": node_id,
+                    "pr_info": pr_info,
+                    "repo_name": repo_name,
                 }
-                repository {
-                    name
-                    owner {
-                        login
-                        id
+            )
+
+            logging.info(f"Processed and cached PR info for commit {commit_sha} in {package}")
+    else:
+        # Handle error case
+        logging.error(f"Failed to fetch PR information for batch of size {len(commit_batch)}")
+        logging.error(f"Response: {response}")
+        # Cache empty results for failed requests to prevent repeated failures
+        for node_id, commit_sha, package, repo_name in commit_batch:
+            cache_manager.github_cache.cache_pr_info(
+                {
+                    "package": package,
+                    "commit_sha": commit_sha,
+                    "commit_node_id": node_id,
+                    "pr_info": {},
                 }
+            )
+            batch_results.append(
+                {
+                    "package": package,
+                    "commit_sha": commit_sha,
+                    "commit_node_id": node_id,
+                    "pr_info": {},
+                    "repo_name": repo_name,
                 }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    """
+            )
 
-    variables = {
-        "nodeId": f"{commit_node_id}",
-        "first": 5,
-    }
-    body = {"query": query, "variables": variables}
-    return make_github_request(url, method="POST", json_data=body, headers=headers, max_retries=5)
+    return batch_results
+
+
+def get_pr_info(data: Dict) -> List[Dict]:
+    """
+    Get PR information for all commits, processing in batches and caching gradually.
+
+    Args:
+        data: Dictionary containing commit information by package
-def get_pr_info(data):
+
+    Returns:
+        List of PR information dictionaries
+    """
     logging.info("Getting PR info for commits...")
     pr_infos = []
+    commits_to_process = []
 
-    commits_data = copy.deepcopy(data)
-
-    for package, info in commits_data.items():
+    # First pass: collect commits that need processing
+    for package, info in data.items():
         repo_name = info.get("repo_name")
-        logging.info(f"Checking PR info in {package}'s repository: {repo_name}")
         authors = info.get("authors", [])
 
         for author in authors:
             commit_sha = author.get("sha")
             commit_node_id = author.get("node_id")
-            commit_url = author.get("commit_url")
+            if not commit_node_id:
+                continue
+
+            # Check cache first
             pr_data = cache_manager.github_cache.get_pr_info(commit_node_id)
-            if not pr_data:
-                if commit_node_id:
-                    pr_info = fetch_pull_requests(commit_node_id)
-                    cache_manager.github_cache.cache_pr_info(
-                        {
-                            "package": package,
-                            "commit_sha": commit_sha,
-                            "commit_node_id": commit_node_id,
-                            "pr_info": pr_info,
-                        }
-                    )
+            if pr_data:
+                # Use cached data
+                pr_infos.append(
+                    {
+                        "package": package,
+                        "commit_sha": commit_sha,
+                        "commit_node_id": commit_node_id,
+                        "pr_info": pr_data["pr_info"],
+                        "repo_name": repo_name,
+                    }
+                )
             else:
-                pr_info = pr_data["pr_info"]
+                # Add to list for batch processing
+                commits_to_process.append((commit_node_id, commit_sha, package, repo_name))
+
+    # Process commits in batches
+    total_commits = len(commits_to_process)
+    if total_commits > 0:
+        logging.info(f"Processing {total_commits} commits in batches of {BATCH_SIZE}")
 
-            all_info = {
-                "package": package,
-                "commit_sha": commit_sha,
-                "commit_node_id": commit_node_id,
-                "pr_info": pr_info,
-                "repo_name": repo_name,
-            }
-            pr_infos.append(all_info)
+        for i in range(0, total_commits, BATCH_SIZE):
+            batch = commits_to_process[i : i + BATCH_SIZE]
+            logging.info(f"Processing batch {i//BATCH_SIZE + 1}/{(total_commits + BATCH_SIZE - 1)//BATCH_SIZE}")
+
+            try:
+                batch_results = fetch_and_cache_batch(batch)
+                pr_infos.extend(batch_results)
+
+                # Log progress
+                processed = min(i + BATCH_SIZE, total_commits)
+                logging.info(f"Processed {processed}/{total_commits} commits ({processed/total_commits*100:.1f}%)")
+
+            except Exception as e:
+                logging.error(f"Error processing batch: {e}")
+                # Continue with next batch instead of failing completely
+                continue
 
     return pr_infos
 
@@ -156,7 +263,6 @@ def get_useful_pr_info(commits_data):
             .get("associatedPullRequests", {})
            .get("edges", [])
        )
-
        for author in commits_data[package].get("authors", []):
            if author.get("node_id") == commit_node_id:
                author["commit_merged_info"] = []
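Note: the batching in `fetch_and_cache_batch` leans on GraphQL field aliases so one request resolves many node IDs. A stripped-down sketch of that aliasing idea (illustrative only — `build_batched_query` is a placeholder name, and the real query requests the full PR fields shown in the diff rather than `totalCount`):

```python
from typing import Dict, List


def build_batched_query(node_ids: List[str]) -> Dict:
    """Alias one node(id: ...) lookup per commit so a single request covers the whole batch."""
    variables = {f"nodeId{i}": node_id for i, node_id in enumerate(node_ids)}
    declarations = ", ".join(f"$nodeId{i}: ID!" for i in range(len(node_ids)))
    parts = " ".join(
        f"node{i}: node(id: $nodeId{i}) {{ ... on Commit {{ associatedPullRequests(first: 5) {{ totalCount }} }} }}"
        for i in range(len(node_ids))
    )
    return {"query": f"query({declarations}) {{ {parts} }}", "variables": variables}


# The response mirrors the aliases: data["node0"], data["node1"], ... map back to
# node_ids[0], node_ids[1], ... by index, which is how the batch is re-associated.
```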
diff --git a/tool/get_pr_review.py b/tool/get_pr_review.py
index 0628e24b..ff465c59 100644
--- a/tool/get_pr_review.py
+++ b/tool/get_pr_review.py
@@ -19,60 +19,110 @@
 url = "https://api.github.com/graphql"
 
 
-def get_first_pr_info(repo_name, review_author_login):
-    query = """
-    query($query: String!, $type: SearchType!, $last: Int!)
-    {search(query: $query, type: $type, last: $last)
-        {
-        nodes {
-            ... on PullRequest {
-                mergedAt
-                merged
-                mergedBy {
-                    login
-                }
-                authorAssociation
-                reviews(first:1, states:APPROVED){
-                    edges{
-                        node{
-                            id
-                            author{
-                                login
-                                __typename
-                                url
-                            }
-                            authorAssociation
-                            createdAt
-                            publishedAt
-                            submittedAt
-                            state
-                            repository{
-                                owner{
-                                    login
-                                }
-                                name
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-
-    """
-
-    search_string = f"repo:{repo_name} is:pr reviewed-by:{review_author_login} sort:author-date-asc"
-    variables = {"query": f"{search_string}", "last": 1, "type": "ISSUE"}
-    body = {"query": query, "variables": variables}
-    return make_github_request(url, method="POST", json_data=body, headers=headers)
+def get_multiple_pr_info(repo_name, review_author_logins):
+    # Build dynamic query with aliases
+    query_fragments = []
+    variables = {}
+
+    for i, login in enumerate(review_author_logins):
+        alias = f"search_{i}"
+        query_fragments.append(
+            f"""
+            {alias}: search(query: $query_{i}, type: ISSUE, last: 1) {{
+                nodes {{
+                    ... on PullRequest {{
+                        mergedAt
+                        merged
+                        mergedBy {{
+                            login
+                        }}
+                        authorAssociation
+                        reviews(first:1, states:APPROVED) {{
+                            edges {{
+                                node {{
+                                    id
+                                    author {{
+                                        login
+                                        __typename
+                                        url
+                                    }}
+                                    authorAssociation
+                                    createdAt
+                                    publishedAt
+                                    submittedAt
+                                    state
+                                    repository {{
+                                        owner {{
+                                            login
+                                        }}
+                                        name
+                                    }}
+                                }}
+                            }}
+                        }}
+                    }}
+                }}
+            }}
+            """
+        )
+        variables[f"query_{i}"] = f"repo:{repo_name} is:pr reviewed-by:{login} sort:author-date-asc"
+
+    # Combine all query fragments
+    complete_query = """
+    query({}) {{
+        {}
+    }}
+    """.format(
+        ", ".join(f"$query_{i}: String!" for i in range(len(review_author_logins))), "\n".join(query_fragments)
+    )
+
+    body = {"query": complete_query, "variables": variables}
+    response = make_github_request(url, method="POST", json_data=body, headers=headers)
+
+    # Restructure response to match expected format
+    if "data" in response:
+        queries = []
+        for i in range(len(review_author_logins)):
+            search_data = response["data"].get(f"search_{i}")
+            if search_data:
+                queries.append(search_data)
+        return {"data": {"queries": queries}}
+    return response
 
 
 def get_pr_review_info(data):
     logging.info("Getting PR review info...")
     pr_data = copy.deepcopy(data)
+    # Collect all uncached reviewer lookups needed
+    uncached_lookups = []
+    for package, info in pr_data.items():
+        for author in info.get("authors", []):
+            for merge_info in author.get("commit_merged_info", []):
+                if merge_info.get("state") != "MERGED":
+                    continue
+
+                repo_name = merge_info.get("repo")
+                for reviewer in merge_info.get("reviews", []):
+                    review_author_login = reviewer.get("review_author")
+                    if not review_author_login:
+                        continue
+
+                    if not cache_manager.github_cache.get_pr_review(repo_name, review_author_login):
+                        uncached_lookups.append((repo_name, review_author_login))
+
+    # Batch fetch uncached reviewers by repository
+    by_repo = {}
+    for repo_name, login in uncached_lookups:
+        by_repo.setdefault(repo_name, set()).add(login)
+
+    for repo_name, logins in by_repo.items():
+        response = get_multiple_pr_info(repo_name, list(logins))
+        # Cache individual results
+        for login, result in zip(logins, response.get("data", {}).get("queries", [])):
+            cache_manager.github_cache.cache_pr_review(package, repo_name, login, {"data": {"search": result}})
+
+    # Process the data using cached results
     for package, info in pr_data.items():
         authors = info.get("authors", [])
         if authors:
@@ -93,12 +143,6 @@ def get_pr_review_info(data):
                         review_author_login = reviewer.get("review_author")
                         review_id = reviewer.get("review_id")
                         first_pr_info = cache_manager.github_cache.get_pr_review(repo_name, review_author_login)
-                        if not first_pr_info:
-                            if review_author_login:
-                                first_pr_info = get_first_pr_info(repo_name, review_author_login)
-                                cache_manager.github_cache.cache_pr_review(
-                                    package, repo_name, review_author_login, first_pr_info
-                                )
                         useful_info = first_pr_info.get("data", {}).get("search", {}).get("nodes", [])
                         first_review_info = useful_info[0] if useful_info else {}
                         all_useful_first_prr_info = first_review_info.get("reviews", {}).get("edges", [])
@@ -144,5 +188,4 @@ def get_pr_review_info(data):
             info["prr_data"] = None
 
     logging.info("PR review info processed.")
-
     return pr_data
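Note: `get_multiple_pr_info` returns results in alias order (`search_0`, `search_1`, ...), so the caller pairs them with reviewer logins by position. A minimal sketch of that pairing, with `fetch_fn` and `cache_fn` as stand-ins for `get_multiple_pr_info` and the cache writer (both placeholders, not the repo's API):

```python
def cache_reviewer_results(repo_name, logins, fetch_fn, cache_fn):
    """Pair aliased search results back to reviewer logins by position (sketch only)."""
    ordered_logins = list(logins)  # freeze one ordering; the aliases are built from this list
    response = fetch_fn(repo_name, ordered_logins) or {}
    results = response.get("data", {}).get("queries", [])
    # If the fetch drops empty results instead of keeping a placeholder (e.g. None),
    # positional pairing can drift, so keeping the two sequences the same length matters.
    for login, result in zip(ordered_logins, results):
        cache_fn(repo_name, login, {"data": {"search": result}})
```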
diff --git a/tool/report_static.py b/tool/report_static.py
index 7540f069..239a0998 100644
--- a/tool/report_static.py
+++ b/tool/report_static.py
@@ -265,16 +265,16 @@ def write_summary(
     no_source_code_repo_df = df.loc[
         df["github_url"] == "No_repo_info_found",
-        ["github_url", "github_exists"] + ["command"] if package_manager == "maven" else [],
+        ["github_url", "github_exists"] + (["command"] if package_manager == "maven" else []),
     ]
     github_repo_404_df = df.loc[
         df["github_exists"] == False,
-        ["github_url", "github_exists"] + ["command"] if package_manager == "maven" else [],
+        ["github_url", "github_exists"] + (["command"] if package_manager == "maven" else []),
     ]
     not_on_github_df = (
         df.loc[
             df["is_github"] == False,
-            ["github_url"] + ["command"] if package_manager == "maven" else [],
+            ["github_url"] + (["command"] if package_manager == "maven" else []),
         ]
         .reset_index(drop=False)
         .drop_duplicates(subset=["package_name"])
@@ -297,9 +297,7 @@ def write_summary(
                 "tag_related_info",
                 "status_code_for_release_tag",
             ]
-            + ["command"]
-            if package_manager == "maven"
-            else []
+            + (["command"] if package_manager == "maven" else [])
         ),
     ]
     # all_deprecated_df = df[df["all_deprecated"] is True]
@@ -318,9 +316,7 @@ def write_summary(
                 "github_url",
                 "parent_repo_link",
             ]
-            + ["command"]
-            if package_manager == "maven"
-            else []
+            + (["command"] if package_manager == "maven" else [])
         ),
     ]
     provenance_df = df.loc[
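Note: the parenthesization changes above are the whole fix. `+` binds tighter than a conditional expression, so without the parentheses the ternary governs the entire concatenated list rather than just `["command"]`. A quick illustration:

```python
package_manager = "pypi"

# Unparenthesized: parsed as (["github_url", "github_exists"] + ["command"]) if ... else []
columns = ["github_url", "github_exists"] + ["command"] if package_manager == "maven" else []
print(columns)  # []

# Parenthesized: only the optional "command" column depends on the package manager
columns = ["github_url", "github_exists"] + (["command"] if package_manager == "maven" else [])
print(columns)  # ['github_url', 'github_exists']
```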
diff --git a/tool/static_analysis.py b/tool/static_analysis.py
index 1355ebea..784a3666 100644
--- a/tool/static_analysis.py
+++ b/tool/static_analysis.py
@@ -10,7 +10,7 @@
 import re
 
 from tool.tool_config import get_cache_manager, make_github_request
-from tool.compare_commits import tag_format as construct_tag_format
+from tool.compare_commits import tag_format as construct_tag_format, find_existing_tags_batch
 import logging
 import xmltodict
 
@@ -171,7 +171,9 @@ def check_maven_signature(package_name, package_version):
     # Regular expression to extract the PGP signature section
     pgp_signature_pattern = re.compile(r"PGP signature:\n(?:[ \t]*.+\n)*?[ \t]*status:\s*(\w+)", re.MULTILINE)
     match = pgp_signature_pattern.search(output.stdout)
+    logging.info(f"Code Signature match: {match}")
     if match:
+        logging.info(f"Matched, signature match: {match.group(1)}")
         # Extract the status
         status = match.group(1).strip().lower()
         return {"signature_present": True, "signature_valid": status == "valid"}
@@ -385,25 +387,24 @@ def check_existence(package_name, repository, extract_message, package_manager):
             have_no_tags_data = have_no_tags_response.json()
 
             if len(have_no_tags_data) == 0:
+                logging.info(f"No tags found for {package_name} in {repo_api}")
                 release_tag_url = None
                 tag_related_info = "No tag was found in the repo"
                 status_code_release_tag = have_no_tags_response_status_code
             else:
                 tag_possible_formats = construct_tag_format(version, package_full_name, repo_name=simplified_path)
-                # Making the default case not finding the tag
-                tag_related_info = "The given tag was not found in the repo"
-                if tag_possible_formats:
-                    for tag_format in tag_possible_formats:
-                        tag_url = f"{repo_api}/git/ref/tags/{tag_format}"
-                        response = make_github_request(tag_url, silent=True)
-                        if response:
-                            release_tag_exists = True
-                            release_tag_url = tag_url
-                            tag_related_info = f"Tag {tag_format} is found in the repo"
-                            status_code_release_tag = 200
-                            break
-                if not release_tag_exists:
-                    logging.info(f"No tags found for {package_name} in {repo_api}")
+                existing_tag_format = find_existing_tags_batch(tag_possible_formats, simplified_path)
+                logging.info(f"Existing tag format: {existing_tag_format}")
+                if existing_tag_format:
+                    existing_tag_format = existing_tag_format[0]
+                    release_tag_exists = True
+                    release_tag_url = f"{repo_api}/git/ref/tags/{existing_tag_format}"
+                    tag_related_info = f"Tag {existing_tag_format} is found in the repo"
+                    status_code_release_tag = 200
+                else:
+                    release_tag_url = None
+                    tag_related_info = "The given tag was not found in the repo"
+                    status_code_release_tag = 404
 
             github_info = {
                 "is_github": True,
diff --git a/tool/tool_config.py b/tool/tool_config.py
index d211381e..a2ddf142 100644
--- a/tool/tool_config.py
+++ b/tool/tool_config.py
@@ -841,6 +841,11 @@ def make_github_request(
             if not silent:
                 logging.warning(f"Request failed: {e}")
             if attempt == max_retries - 1:
+                if e.response.status_code in [
+                    502,
+                    504,
+                ]:  # timeout, sometimes happens when the request is too large (e.g., too many tags)
+                    return 504
                 return None
             time.sleep(retry_delay * (attempt + 1))
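Note: one caveat on the `make_github_request` change — `requests` sets `e.response` to `None` for failures that never produced an HTTP response (connection errors, client-side timeouts), so reading `e.response.status_code` unguarded can raise `AttributeError`. A defensive variant of the same check, offered as a sketch rather than the repository's actual code:

```python
import requests


def classify_retry_failure(e: requests.RequestException):
    """Return the 504 sentinel for gateway-timeout-style failures, otherwise None."""
    response = getattr(e, "response", None)
    if response is not None and response.status_code in (502, 504):
        # Timeouts sometimes happen when the request is too large (e.g., too many tags).
        return 504
    return None
```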