
fix-ci #107

Merged: 24 commits, Apr 22, 2025

3 changes: 3 additions & 0 deletions .coveragerc
@@ -0,0 +1,3 @@
[run]
branch = True
relative_files = True
16 changes: 8 additions & 8 deletions .github/workflows/qiita-plugin-ci.yml
@@ -2,7 +2,7 @@

on:
push:
branches: [dev]
branches: [master]
pull_request:

jobs:
@@ -90,11 +90,10 @@ jobs:
pip --quiet install https://github.com/qiita-spots/qtp-job-output-folder/archive/refs/heads/main.zip

# pip --quiet install .
pip install .
export QP_KLP_CONFIG_FP=`pwd`/tests/configuration.json
pip install -e .
pip --quiet install coveralls

export QP_KLP_CONFIG_FP=`pwd`/configuration.json

configure_qtp_job_output_folder --env-script "source /home/runner/.profile; conda activate klp" --ca-cert $QIITA_ROOTCA_CERT
configure_klp --env-script "source /home/runner/.profile; export QP_KLP_CONFIG_FP=$QP_KLP_CONFIG_FP; conda activate klp" --ca-cert $QIITA_ROOTCA_CERT

@@ -131,17 +130,18 @@ jobs:
sleep 10
cat /tmp/supervisord.log

- name: Main tests
- name: qp-knight-lab-processing & mg-scripts tests
shell: bash -l {0}
env:
COVER_PACKAGE: ${{ matrix.cover_package }}
run: |
conda activate klp
export QIITA_ROOTCA_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg
export QP_KLP_CONFIG_FP=`pwd`/configuration.json
export QP_KLP_CONFIG_FP=`pwd`/tests/configuration.json
export PYTHONWARNINGS="ignore:Certificate for localhost has no \`subjectAltName\`"
nosetests --with-doctest --with-coverage -v --cover-package=qp_klp
nosetests --with-doctest --with-coverage -v --cover-package qp_klp,sequence_processing_pipeline

- uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
@@ -160,4 +160,4 @@ jobs:
- name: lint
run: |
pip install -q flake8
flake8 qp_klp
flake8 .
5 changes: 4 additions & 1 deletion .gitignore
@@ -129,4 +129,7 @@ dmypy.json
.pyre/

# ignore local changes
qp_klp/tests/data/sample-sheets/*/*/*.csv
tests/data/sample-sheets/*/*/*.csv

# test output
tests/data/output_dir/
5 changes: 4 additions & 1 deletion LICENSE
@@ -1,6 +1,9 @@
BSD 3-Clause License

Copyright (c) 2014--, Qiita Spots
Copyright (c) 2014, Qiita Spots
Copyright (c) 2020, biocore
Copyright (c) 2021, jdereus

All rights reserved.

Redistribution and use in source and binary forms, with or without
68 changes: 68 additions & 0 deletions README.md
@@ -0,0 +1,68 @@
# Sequence Processing Pipeline

A Jupyter notebook to assist the wet lab shotgun pipeline.
A packaged Python-based implementation of the Knight Lab's sequence processing pipeline.

## Installation

To install this package, first clone this repository from GitHub:

```bash
git clone https://github.com/biocore/mg-scripts.git
```

Create a Python3 Conda environment in which to run the notebook:

```bash
conda create --yes -n spp python=3.9 scikit-learn pandas numpy nose pep8 flake8 matplotlib jupyter notebook 'seaborn>=0.7.1' pip openpyxl 'seqtk>=1.4' click scipy fastq-pair
```

Activate the Conda environment:

```bash
conda activate spp
```

Change directory to the cloned repository folder and install:

```bash
cd mg-scripts
pip install -e .
```

This will automatically install https://github.com/biocore/metagenomics_pooling_notebook.git, a dependency of mg-scripts and the sequence_processing_pipeline.

## Running Unittests

Change directory to the cloned repository folder and run the tests:

```bash
cd mg-scripts
nosetests --with-coverage --cover-inclusive --cover-package sequence_processing_pipeline
```

## Getting Started

Review Pipeline.py and main.py to learn how to import and access package functionality:

```bash
cd mg-scripts/sequence_processing_pipeline
more Pipeline.py
more main.py
```
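
As a quick orientation, a minimal sketch of importing the package from Python is shown below; the `Pipeline` class name and module path are assumed from the repository layout above, and no constructor arguments are shown because the actual signature should be taken from Pipeline.py itself.

```python
# Minimal sketch (assumptions: the package is installed and Pipeline.py
# defines a Pipeline class; check Pipeline.py for the real constructor
# signature before use).
from sequence_processing_pipeline.Pipeline import Pipeline

# print the class docstring and constructor signature rather than guessing
help(Pipeline)
```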

Adjust configuration settings as needed:

```bash
vi tests/data/configuration.json
```

Please note that the setting 'minimap2_databases' is expected to be a list of paths to individual .mmi files for QCJob.
For NuQCJob, minimap2_databases is expected to be the path to a directory containing two subdirectories: 'metagenomic'
and 'metatranscriptomic'. Each directory should contain or symlink to the appropriate .mmi files needed for that Assay
type.
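
As an illustration only, the two shapes described above might look like the sketch below; the variable names and paths are hypothetical, and the authoritative structure is whatever appears in tests/data/configuration.json.

```python
# Hypothetical sketch of the two 'minimap2_databases' shapes described above.
# Paths and names are illustrative only; consult tests/data/configuration.json
# for the real structure.

# QCJob: a list of paths to individual .mmi files.
qcjob_minimap2_databases = [
    "/databases/mmi/human-GRCh38.mmi",
    "/databases/mmi/phix.mmi",
]

# NuQCJob: a single directory with 'metagenomic/' and 'metatranscriptomic/'
# subdirectories, each containing (or symlinking to) the .mmi files for that
# Assay type, e.g.:
#   /databases/mmi/metagenomic/*.mmi
#   /databases/mmi/metatranscriptomic/*.mmi
nuqcjob_minimap2_databases = "/databases/mmi"
```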

Additional TellSeq-related notes:
'spades-cloudspades-0.1', 'tellread-release-novaseqX', or similar directories must be placed in a location available to SPP,
and their paths must be made known to SPP in the configuration files (see the examples for details).
Additional scripts found in sequence_processing_pipeline/contrib were contributed by Daniel and Omar and can be located and configured in the same way.
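
Purely as an illustration, such configuration entries might look like the sketch below; the key names and paths are invented for this example and do not reflect the real configuration schema.

```python
# Hypothetical sketch only: these key names are invented for illustration.
# See the example configuration files in the repository for the actual keys.
tellseq_related_paths = {
    "spades_path": "/opt/spp/spades-cloudspades-0.1",
    "tellread_path": "/opt/spp/tellread-release-novaseqX",
    "contrib_scripts": "/opt/spp/mg-scripts/sequence_processing_pipeline/contrib",
}
```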
34 changes: 25 additions & 9 deletions pyproject.toml
@@ -2,21 +2,28 @@
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"

[tool.hatch.build.targets.wheel]
packages = ["qp_klp", "sequence_processing_pipeline"]

[tool.poetry]
packages = [
{ include = "src/qp_klp", from = "./", to = "qp_klp" },
{ include = "src/sequence_processing_pipeline", from = "./", to = "sequence_processing_pipeline" },
]

[project]
name = "qp_klp"
name = "knight_lab_processing"
# version strings must comply with PEP 440:
# https://peps.python.org/pep-0440/
version = "2022.4"
authors = [
{name = "Qiita Development Team", email = "qiita.help@gmail.com"},
]
version = "2025.4"
authors = [{ name = "Qiita Development Team", email = "qiita.help@gmail.com" }]
description = "Qiita Plugin: Knight Lab Processing"
readme = "README.rst"
# ">=" required as most available versions of 3.9 will be "3.9.17" or
# similar.
requires-python = ">=3.9"
keywords = ["bioinformatics", "qiita", "sequence_processing_pipeline"]
license = {text = "BSD"}
license = { text = "BSD" }
classifiers = [
"Development Status :: 5 - Production/Stable",
"License :: OSI Approved :: BSD License",
@@ -27,17 +34,26 @@ classifiers = [
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: Implementation :: CPython",
"Operating System :: POSIX :: Linux",
"Operating System :: MacOS :: MacOS X"
]
"Operating System :: MacOS :: MacOS X",
]
dependencies = [
"nose>=0.10.1",
"click>=3.3",
"future",
"pandas",
'requests',
'flake8',
'nose',
'coverage',
'pgzip',
'jinja2',
'numpy',
'cython',
"qiita-files@https://github.com/qiita-spots/qiita-files/archive/master.zip",
"qiita_client@https://github.com/qiita-spots/qiita_client/archive/master.zip",
"sequence-processing-pipeline@https://github.com/biocore/mg-scripts/archive/master.zip"
'metapool@https://github.com/biocore/metagenomics_pooling_notebook/archive/master.zip',
]
[project.scripts]
configure_klp = "qp_klp.scripts.configure_klp:config"
start_klp = "qp_klp.scripts.start_klp:execute"
demux = "sequence_processing_pipeline.scripts.cli:demux"
6 changes: 0 additions & 6 deletions qp_klp/tests/data/configuration.json

This file was deleted.

1 change: 0 additions & 1 deletion qp_klp/tests/data/multiqc-bclconvert-config.yaml

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
158 changes: 158 additions & 0 deletions src/sequence_processing_pipeline/Commands.py
@@ -0,0 +1,158 @@
import glob
import gzip
import os
from sequence_processing_pipeline.util import (iter_paired_files,
                                               determine_orientation)


def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
                            batch_prefix):
    '''Partitions input fastqs to coarse bins

    :param data_location_path: Path to the ConvertJob directory.
    :param max_file_list_size_in_gb: Upper threshold for file-size.
    :param batch_prefix: Path + file-name prefix for output-files.
    :return: The number of output-files created, size of largest bin.
    '''
    # to prevent issues w/filenames like the ones below from being mistaken
    # for R1 or R2 files, use determine_orientation().
    # LS_8_22_2014_R2_SRE_S2_L007_I1_001.fastq.gz
    # LS_8_22_2014_R1_SRE_S3_L007_I1_001.fastq.gz

    # since the names of all fastq files are being scanned for orientation,
    # collect all of them instead of mistakenly pre-filtering some files.
    fastq_paths = glob.glob(data_location_path + '/*/*.fastq.gz')
    fastq_paths = [x for x in fastq_paths
                   if determine_orientation(x) in ['R1', 'R2']]

    # convert from GB and halve as we sum R1
    max_size = (int(max_file_list_size_in_gb) * (2 ** 30) / 2)

    split_offset = 0

    # ensure we are max-sized to start.
    current_size = max_size * 10
    fp = None

    bucket_size = 0
    max_bucket_size = 0

    for a, b in iter_paired_files(fastq_paths):
        r1_size = os.stat(a).st_size
        r2_size = os.stat(b).st_size

        output_base = os.path.dirname(a).split('/')[-1]
        if current_size + r1_size > max_size:
            # bucket is full.
            if bucket_size > max_bucket_size:
                max_bucket_size = bucket_size

            # reset bucket_size.
            bucket_size = r1_size + r2_size

            if fp is not None:
                fp.close()

            split_offset += 1
            current_size = r1_size
            fp = open(batch_prefix + '-%d' % split_offset, 'w')
        else:
            # add to bucket_size
            bucket_size += r1_size + r2_size
            current_size += r1_size

        fp.write("%s\t%s\t%s\n" % (a, b, output_base))

    if fp is not None:
        fp.close()

    if split_offset == 0:
        raise ValueError("No splits made")

    return split_offset, max_bucket_size


def demux_cmd(id_map_fp, fp_fp, out_d, task, maxtask):
    with open(id_map_fp, 'r') as f:
        id_map = f.readlines()
        id_map = [line.strip().split('\t') for line in id_map]

    # fp needs to be an open file handle.
    # ensure task and maxtask are proper ints when coming from cmd-line.
    with open(fp_fp, 'r') as fp:
        demux(id_map, fp, out_d, int(task), int(maxtask))


def demux(id_map, fp, out_d, task, maxtask):
    """Split infile data based in provided map"""
    delimiter = '::MUX::'
    mode = 'wt'
    ext = '.fastq.gz'
    sep = '/'
    rec = '@'

    openfps = {}

    for offset, (idx, r1, r2, outbase) in enumerate(id_map):
        if offset % maxtask == task:
            idx = rec + idx

            # setup output locations
            outdir = out_d + sep + outbase
            fullname_r1 = outdir + sep + r1 + ext
            fullname_r2 = outdir + sep + r2 + ext

            os.makedirs(outdir, exist_ok=True)
            current_fp_r1 = gzip.open(fullname_r1, mode)
            current_fp_r2 = gzip.open(fullname_r2, mode)
            current_fp = {'1': current_fp_r1, '2': current_fp_r2}
            openfps[idx] = current_fp

    # setup a parser
    seq_id = iter(fp)
    seq = iter(fp)
    dumb = iter(fp)
    qual = iter(fp)

    for i, s, d, q in zip(seq_id, seq, dumb, qual):
        # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1'
        # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1 BX:Z:TATGACACATGCGGCCCT' # noqa
        # '@baz/1

        # NB: from 6d794a37-12cd-4f8e-95d6-72a4b8a1ec1c's only-adapter-filtered results: # noqa
        # @A00953:244:HYHYWDSXY:3:1101:14082:3740 1:N:0:CCGTAAGA+TCTAACGC

        fname_encoded, sid = i.split(delimiter, 1)

        if fname_encoded not in openfps:
            continue

        current_fp = openfps[fname_encoded]

        # remove '\n' from sid and split on all whitespace.
        tmp = sid.strip().split()

        if len(tmp) == 1:
            # sequence id line contains no optional metadata.
            # don't change sid.
            # -1 is \n
            orientation = sid[-2]
            sid = rec + sid
        elif len(tmp) == 2:
            sid = tmp[0]
            metadata = tmp[1]
            # no '\n'
            orientation = sid[-1]
            # hexdump confirms separator is ' ', not '\t'
            sid = rec + sid + ' ' + metadata + '\n'
        else:
            raise ValueError(f"'{sid}' is not a recognized form")

        current_fp[orientation].write(sid)
        current_fp[orientation].write(s)
        current_fp[orientation].write(d)
        current_fp[orientation].write(q)

    for d in openfps.values():
        for f in d.values():
            f.close()