From 8ef4e6c032c5ddae967e4409c5f90b2b8150bdd0 Mon Sep 17 00:00:00 2001 From: Angie Hinrichs Date: Fri, 8 Apr 2022 10:42:53 -0700 Subject: [PATCH 1/3] Install pangolin-assignment from a web directory instead of github / git-lfs. Up to this point, all data dependencies have been github cov-lineages repositories. The cache file in pangolin-assignment exceeded the github file size limit so we changed the pangolin-assignment repository to use git-lfs. Thanks @pvanheus for pointing out that github has storage and bandwidth quotas for Git LFS usage, and that by default the pangolin-assignment release tarball from github does not include the cache file; it can be added to the release tarball, but will count further against the storage and bandwidth quotas. Since the cache file is generated at UCSC, which has ample web server storage and bandwidth, this adds a new mechanism to search for the latest versioned tarball in a web directory (instead of querying the github API), compare its version to the locally installed package if present (using the same pip/__init__.py __version__ mechanism), and install the tarball from the web directory (instead of github). Note: currently the URL for pangolin-assignment uses the hgdownload-test server; this will need to be changed to hgdownload after some testing and before release. 
--- pangolin/utils/update.py | 66 +++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 17 deletions(-) diff --git a/pangolin/utils/update.py b/pangolin/utils/update.py index e1ceee9..bb27d1a 100644 --- a/pangolin/utils/update.py +++ b/pangolin/utils/update.py @@ -3,6 +3,7 @@ import os import sys import json +import re import shutil import tarfile import subprocess @@ -14,8 +15,10 @@ version_dict_keys = ['pangolin', 'scorpio', 'pangolin-data', 'constellations', 'pangolin-assignment'] +dependency_web_dir = { 'pangolin-assignment': 'https://hgdownload-test.gi.ucsc.edu/goldenPath/wuhCor1/pangolin-assignment' } -def get_latest_release(dependency): + +def get_latest_cov_lineages(dependency): """ Using the github releases API check for the latest release of dependency and its tarball """ @@ -43,31 +46,58 @@ def get_latest_release(dependency): return latest_release, latest_release_tarball -def git_lfs_install(): +def get_latest_web_dir(dependency, web_dir): """ - 'git-lfs install' must be run after installing git-lfs and before cloning a repo - that uses Git LFS. + Find the tarball url with the latest release from a web directory with versioned tarballs + instead of github. An HTTP GET of the web directory must return some text that contains + names of files in that directory, some of which are {dependency}-{version}.tar.gz. 
""" try: - subprocess.run(['git-lfs', 'install'], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL) - except CalledProcessError as e: - sys.stderr.write(cyan(f'Error: "git-lfs install" failed: {e}')) + listing = request.urlopen(web_dir).read().decode('utf-8') + except: + sys.stderr.write(cyan(f"Unable to read {web_dir}")) + sys.exit(-1) + tarRe = re.compile(f"{dependency}-(.*?).tar.gz") + matches = list(set(tarRe.findall(listing))) + if not matches: + sys.stderr.write(cyan(f"Can't find {dependency}-.tar.gz files in listing of {web_dir}")) sys.exit(-1) + versions = [LooseVersion(v) for v in matches] + versions.sort() + latest_release = str(versions[-1]) + latest_release_tarball = f"{web_dir}/{dependency}-{latest_release}.tar.gz" + return latest_release, latest_release_tarball + + +def get_latest_release(dependency): + """ + If dependency comes from a web directory then find latest release and tarball there, otherwise + query github API for cov-lineages repo + """ + if dependency in dependency_web_dir: + return get_latest_web_dir(dependency, dependency_web_dir[dependency]) + else: + return get_latest_cov_lineages(dependency) -def pip_install_dep(dependency, release): + +def pip_install_url(url): """ - Use pip install to install a cov-lineages repository with the specificed release + Use pip install to install a package from a url. """ - url = f"git+https://github.com/cov-lineages/{dependency}.git@{release}" subprocess.run([sys.executable, '-m', 'pip', 'install', '--upgrade', url], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) +def pip_install_cov_lineages(dependency, release): + """ + Use pip install to install a cov-lineages repository with the specified release + """ + url = f"git+https://github.com/cov-lineages/{dependency}.git@{release}" + pip_install_url(url) + + def install_pangolin_assignment(): """ If the pangolin-assignment repo has not been installed already then install the latest release. 
@@ -77,9 +107,8 @@ def install_pangolin_assignment(): print(f"pangolin-assignment already installed with version {pangolin_assignment.__version__}; use --update or --update-data if you wish to update it.", file=sys.stderr) except: - git_lfs_install() latest_release, tarball = get_latest_release('pangolin-assignment') - pip_install_dep('pangolin-assignment', latest_release) + pip_install_url(tarball) print(f"pangolin-assignment installed with latest release ({latest_release})") @@ -99,7 +128,7 @@ def update(version_dictionary, data_dir=None): Using the github releases API check for the latest current release of the set of dependencies provided e.g., pangolin, scorpio, pangolin-data and constellations for complete --update and just pangolearn and constellations - for --update_data. If pangolin-assignment has been added to the installation + for --update_data. If pangolin-assignment has been added to version_dictionary then it will be included in both --update and --update-data. Dictionary keys must be one of pangolin, scorpio, pangolin-data, constellations @@ -163,7 +192,10 @@ def update(version_dictionary, data_dir=None): shutil.rmtree(destination_directory) shutil.move(os.path.join(tempdir, extracted_dir, dependency_package), destination_directory) else: - pip_install_dep(dependency, latest_release) + if dependency in dependency_web_dir: + pip_install_url(latest_release_tarball) + else: + pip_install_cov_lineages(dependency, latest_release) print(f"{dependency} updated to {latest_release}", file=sys.stderr) elif version > latest_release_tidied: print(f"{dependency} ({version}) is newer than latest stable " From 4fd6c2a9b23f22e213bb0fa01bee31f7965b2650 Mon Sep 17 00:00:00 2001 From: Angie Hinrichs Date: Fri, 8 Apr 2022 15:27:20 -0700 Subject: [PATCH 2/3] Remove error exit in case of mismatching pangolin-data and pangolin-assignment versions. There may be patch releases that make sense for pangolin-data but not pangolin-assignment (e.g. 
pangoLEARN patch update), and the suggestion to run --update-data is not helpful because that's how the versions came to be installed in the first place. --- pangolin/utils/data_checks.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pangolin/utils/data_checks.py b/pangolin/utils/data_checks.py index 05278b5..80b1e6a 100644 --- a/pangolin/utils/data_checks.py +++ b/pangolin/utils/data_checks.py @@ -95,13 +95,6 @@ def get_assignment_cache(cache_file, config): 'pangolin-assignment repository (that will make future data updates slower).\n')) sys.exit(-1) - # Check versions of pangolin-data and pangolin-assignment to make sure they are consistent. - if pangolin_assignment.__version__.lstrip('v') != config[KEY_PANGOLIN_DATA_VERSION].lstrip('v'): - print(cyan(f'Error: pangolin_assignment cache version {pangolin_assignment.__version__} ' - f'does not match pangolin_data version {config[KEY_PANGOLIN_DATA_VERSION]}. ' - 'Run "pangolin --update-data" to fetch latest versions of both.')) - sys.exit(-1) - try: with gzip.open(cache, 'rt') as f: line = f.readline() From 5538368c6d3876e2eeae30d1cf52565e4d35423a Mon Sep 17 00:00:00 2001 From: Angie Hinrichs Date: Tue, 12 Apr 2022 12:07:03 -0700 Subject: [PATCH 3/3] Update pangolin-assignment URL to hgdownload (not -test). Also fix option name typo in github query exception message. 
--- pangolin/utils/update.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pangolin/utils/update.py b/pangolin/utils/update.py index 4ab811c..758103f 100644 --- a/pangolin/utils/update.py +++ b/pangolin/utils/update.py @@ -15,7 +15,7 @@ version_dict_keys = ['pangolin', 'scorpio', 'pangolin-data', 'constellations', 'pangolin-assignment'] -dependency_web_dir = { 'pangolin-assignment': 'https://hgdownload-test.gi.ucsc.edu/goldenPath/wuhCor1/pangolin-assignment' } +dependency_web_dir = { 'pangolin-assignment': 'https://hgdownload.gi.ucsc.edu/goldenPath/wuhCor1/pangolin-assignment' } def get_latest_cov_lineages(dependency): @@ -32,8 +32,8 @@ def get_latest_cov_lineages(dependency): # so if this is thrown and there is definitely connectivity then # double check the version labels except Exception as e: - sys.stderr.write(cyan("Unable to connect to reach github API " - "--update/--data_update requires internet " + sys.stderr.write(cyan("Unable to connect to reach github API. " + "--update/--update-data requires internet " "connectivity so may not work on certain " "systems or if your IP has exceeded the " f"5,000 request per hour limit\n{e}\n"))