diff --git a/ingest/Snakefile b/ingest/Snakefile index ad7e0f7c..94181fe1 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -11,7 +11,7 @@ serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4'] rule all: input: - expand(["results/sequences_{serotype}.fasta", "results/metadata_{serotype}.tsv"], serotype=serotypes) + expand(["results/{serotype}/sequences.fasta", "results/{serotype}/metadata.tsv"], serotype=serotypes) include: "rules/fetch_from_ncbi.smk" diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml index 853cafb3..bb4b8494 100644 --- a/ingest/build-configs/nextstrain-automation/config.yaml +++ b/ingest/build-configs/nextstrain-automation/config.yaml @@ -17,13 +17,13 @@ s3_dst: "s3://nextstrain-data/files/workflows/dengue" files_to_upload: genbank.ndjson.xz: data/genbank.ndjson all_sequences.ndjson.xz: data/sequences.ndjson - metadata_all.tsv.zst: results/metadata_all.tsv - sequences_all.fasta.zst: results/sequences_all.fasta - metadata_denv1.tsv.zst: results/metadata_denv1.tsv - sequences_denv1.fasta.zst: results/sequences_denv1.fasta - metadata_denv2.tsv.zst: results/metadata_denv2.tsv - sequences_denv2.fasta.zst: results/sequences_denv2.fasta - metadata_denv3.tsv.zst: results/metadata_denv3.tsv - sequences_denv3.fasta.zst: results/sequences_denv3.fasta - metadata_denv4.tsv.zst: results/metadata_denv4.tsv - sequences_denv4.fasta.zst: results/sequences_denv4.fasta + all/metadata.tsv.zst: results/all/metadata.tsv + all/sequences.fasta.zst: results/all/sequences.fasta + denv1/metadata.tsv.zst: results/denv1/metadata.tsv + denv1/sequences.fasta.zst: results/denv1/sequences.fasta + denv2/metadata.tsv.zst: results/denv2/metadata.tsv + denv2/sequences.fasta.zst: results/denv2/sequences.fasta + denv3/metadata.tsv.zst: results/denv3/metadata.tsv + denv3/sequences.fasta.zst: results/denv3/sequences.fasta + denv4/metadata.tsv.zst: 
results/denv4/metadata.tsv + denv4/sequences.fasta.zst: results/denv4/sequences.fasta diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 391b0255..bd9e8a89 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -49,8 +49,8 @@ rule curate: all_geolocation_rules="data/all-geolocation-rules.tsv", annotations=config["curate"]["annotations"], output: - metadata="data/metadata_all.tsv", - sequences="results/sequences_all.fasta", + metadata="data/all/metadata.tsv", + sequences="results/all/sequences.fasta", log: "logs/curate.txt", params: diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 32440638..f37a79c0 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -25,11 +25,11 @@ rule nextclade_denvX: Note: If using --cds-selection, only the thoese genes are reported in the failedCdses column """ input: - sequences="results/sequences_{serotype}.fasta", + sequences="results/{serotype}/sequences.fasta", dataset="../nextclade_data/{serotype}", output: nextclade_denvX="data/nextclade_results/nextclade_{serotype}.tsv", - nextclade_alignment="results/aligned_{serotype}.fasta", + nextclade_alignment="results/{serotype}/aligned.fasta", nextclade_translations=expand("data/translations/{{serotype}}/{gene}/seqs.gene.fasta", gene=config["nextclade"]["gene"]), threads: 4 params: @@ -79,7 +79,7 @@ rule append_nextclade_columns: Append the nextclade results to the metadata """ input: - metadata="data/metadata_all.tsv", + metadata="data/all/metadata.tsv", genotype_nextclade="results/nextclade_genotypes.tsv", output: metadata_all="data/metadata_nextclade.tsv", @@ -136,7 +136,7 @@ rule append_gene_coverage_columns: metadata="data/metadata_nextclade.tsv", gene_coverage=expand("results/{gene}/gene_coverage_all.tsv", gene=config["nextclade"]["gene"]) output: - metadata_all="results/metadata_all.tsv", + metadata_all="results/all/metadata.tsv", params: id_field=config["curate"]["id_field"], shell: @@ -159,9 +159,9 
@@ rule split_metadata_by_serotype: Split the metadata by serotype """ input: - metadata="results/metadata_all.tsv", + metadata="results/all/metadata.tsv", output: - serotype_metadata="results/metadata_{serotype}.tsv" + serotype_metadata="results/{serotype}/metadata.tsv" wildcard_constraints: serotype=SEROTYPE_CONSTRAINTS params: diff --git a/ingest/rules/split_serotypes.smk b/ingest/rules/split_serotypes.smk index 29b22547..9503bcdb 100644 --- a/ingest/rules/split_serotypes.smk +++ b/ingest/rules/split_serotypes.smk @@ -17,10 +17,10 @@ rule split_by_serotype_genbank: Split the data by serotype based on the NCBI Genbank metadata. """ input: - metadata = "data/metadata_all.tsv", - sequences = "results/sequences_all.fasta" + metadata = "data/all/metadata.tsv", + sequences = "results/all/sequences.fasta" output: - sequences = "results/sequences_{serotype}.fasta" + sequences = "results/{serotype}/sequences.fasta" params: id_field = config["curate"]["id_field"], serotype_field = config["curate"]["serotype_field"] diff --git a/nextclade/config/config_dengue.yaml b/nextclade/config/config_dengue.yaml index 00e15564..6b0fb188 100644 --- a/nextclade/config/config_dengue.yaml +++ b/nextclade/config/config_dengue.yaml @@ -1,8 +1,8 @@ # Sequences must be FASTA and metadata must be TSV # Both files must be zstd compressed # Both files must have a {serotype} expandable field to be replaced by all, denv1-denv4 -sequences_url: "https://data.nextstrain.org/files/workflows/dengue/sequences_{serotype}.fasta.zst" -metadata_url: "https://data.nextstrain.org/files/workflows/dengue/metadata_{serotype}.tsv.zst" +sequences_url: "https://data.nextstrain.org/files/workflows/dengue/{serotype}/sequences.fasta.zst" +metadata_url: "https://data.nextstrain.org/files/workflows/dengue/{serotype}/metadata.tsv.zst" strain_id_field: "genbank_accession" display_strain_field: "strain" diff --git a/phylogenetic/config/config_dengue.yaml b/phylogenetic/config/config_dengue.yaml index fbcd7263..e6bb8531 
100644 --- a/phylogenetic/config/config_dengue.yaml +++ b/phylogenetic/config/config_dengue.yaml @@ -1,8 +1,8 @@ # Sequences must be FASTA and metadata must be TSV # Both files must be zstd compressed # Both files must have a {serotype} expandable field to be replaced by all, denv1-denv4 -sequences_url: "https://data.nextstrain.org/files/workflows/dengue/sequences_{serotype}.fasta.zst" -metadata_url: "https://data.nextstrain.org/files/workflows/dengue/metadata_{serotype}.tsv.zst" +sequences_url: "https://data.nextstrain.org/files/workflows/dengue/{serotype}/sequences.fasta.zst" +metadata_url: "https://data.nextstrain.org/files/workflows/dengue/{serotype}/metadata.tsv.zst" strain_id_field: "genbank_accession" display_strain_field: "strain" diff --git a/phylogenetic/config/description.md b/phylogenetic/config/description.md index bc09c381..4e323d91 100644 --- a/phylogenetic/config/description.md +++ b/phylogenetic/config/description.md @@ -2,13 +2,13 @@ We gratefully acknowledge the authors, originating and submitting laboratories o We curate sequence data and metadata from NCBI as starting point for our analyses. 
Curated sequences and metadata are available as flat files at: -* [data.nextstrain.org/files/workflows/dengue/sequences_all.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/sequences_all.fasta.zst) -* [data.nextstrain.org/files/workflows/dengue/metadata_all.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/metadata_all.tsv.zst) -* [data.nextstrain.org/files/workflows/dengue/sequences_denv1.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/sequences_denv1.fasta.zst) -* [data.nextstrain.org/files/workflows/dengue/metadata_denv1.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/metadata_denv1.tsv.zst) -* [data.nextstrain.org/files/workflows/dengue/sequences_denv2.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/sequences_denv2.fasta.zst) -* [data.nextstrain.org/files/workflows/dengue/metadata_denv2.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/metadata_denv2.tsv.zst) -* [data.nextstrain.org/files/workflows/dengue/sequences_denv3.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/sequences_denv3.fasta.zst) -* [data.nextstrain.org/files/workflows/dengue/metadata_denv3.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/metadata_denv3.tsv.zst) -* [data.nextstrain.org/files/workflows/dengue/sequences_denv4.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/sequences_denv4.fasta.zst) -* [data.nextstrain.org/files/workflows/dengue/metadata_denv4.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/metadata_denv4.tsv.zst) +* [data.nextstrain.org/files/workflows/dengue/all/sequences.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/all/sequences.fasta.zst) +* [data.nextstrain.org/files/workflows/dengue/all/metadata.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/all/metadata.tsv.zst) +* [data.nextstrain.org/files/workflows/dengue/denv1/sequences.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/denv1/sequences.fasta.zst) +* 
[data.nextstrain.org/files/workflows/dengue/denv1/metadata.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/denv1/metadata.tsv.zst) +* [data.nextstrain.org/files/workflows/dengue/denv2/sequences.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/denv2/sequences.fasta.zst) +* [data.nextstrain.org/files/workflows/dengue/denv2/metadata.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/denv2/metadata.tsv.zst) +* [data.nextstrain.org/files/workflows/dengue/denv3/sequences.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/denv3/sequences.fasta.zst) +* [data.nextstrain.org/files/workflows/dengue/denv3/metadata.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/denv3/metadata.tsv.zst) +* [data.nextstrain.org/files/workflows/dengue/denv4/sequences.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/denv4/sequences.fasta.zst) +* [data.nextstrain.org/files/workflows/dengue/denv4/metadata.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/denv4/metadata.tsv.zst)