Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restructure s3 urls #73

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4']

rule all:
input:
expand(["results/sequences_{serotype}.fasta", "results/metadata_{serotype}.tsv"], serotype=serotypes)
expand(["results/{serotype}/sequences.fasta", "results/{serotype}/metadata.tsv"], serotype=serotypes)


include: "rules/fetch_from_ncbi.smk"
Expand Down
20 changes: 10 additions & 10 deletions ingest/build-configs/nextstrain-automation/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ s3_dst: "s3://nextstrain-data/files/workflows/dengue"
files_to_upload:
genbank.ndjson.xz: data/genbank.ndjson
all_sequences.ndjson.xz: data/sequences.ndjson
metadata_all.tsv.zst: results/metadata_all.tsv
sequences_all.fasta.zst: results/sequences_all.fasta
metadata_denv1.tsv.zst: results/metadata_denv1.tsv
sequences_denv1.fasta.zst: results/sequences_denv1.fasta
metadata_denv2.tsv.zst: results/metadata_denv2.tsv
sequences_denv2.fasta.zst: results/sequences_denv2.fasta
metadata_denv3.tsv.zst: results/metadata_denv3.tsv
sequences_denv3.fasta.zst: results/sequences_denv3.fasta
metadata_denv4.tsv.zst: results/metadata_denv4.tsv
sequences_denv4.fasta.zst: results/sequences_denv4.fasta
all/metadata.tsv.zst: results/all/metadata.tsv
all/sequences.fasta.zst: results/all/sequences.fasta
# Per-serotype uploads use the same <serotype>/metadata.tsv.zst and
# <serotype>/sequences.fasta.zst names as the "all" entries, so they match the
# {serotype}/... URLs that the downstream workflow configs fetch.
denv1/metadata.tsv.zst: results/denv1/metadata.tsv
denv1/sequences.fasta.zst: results/denv1/sequences.fasta
denv2/metadata.tsv.zst: results/denv2/metadata.tsv
denv2/sequences.fasta.zst: results/denv2/sequences.fasta
denv3/metadata.tsv.zst: results/denv3/metadata.tsv
denv3/sequences.fasta.zst: results/denv3/sequences.fasta
denv4/metadata.tsv.zst: results/denv4/metadata.tsv
denv4/sequences.fasta.zst: results/denv4/sequences.fasta
4 changes: 2 additions & 2 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ rule curate:
all_geolocation_rules="data/all-geolocation-rules.tsv",
annotations=config["curate"]["annotations"],
output:
metadata="data/metadata_all.tsv",
sequences="results/sequences_all.fasta",
metadata="data/all/metadata.tsv",
sequences="results/all/sequences.fasta",
log:
"logs/curate.txt",
params:
Expand Down
12 changes: 6 additions & 6 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ rule nextclade_denvX:
Note: If using --cds-selection, only those genes are reported in the failedCdses column
"""
input:
sequences="results/sequences_{serotype}.fasta",
sequences="results/{serotype}/sequences.fasta",
dataset="../nextclade_data/{serotype}",
output:
nextclade_denvX="data/nextclade_results/nextclade_{serotype}.tsv",
nextclade_alignment="results/aligned_{serotype}.fasta",
nextclade_alignment="results/{serotype}/aligned.fasta",
nextclade_translations=expand("data/translations/{{serotype}}/{gene}/seqs.gene.fasta", gene=config["nextclade"]["gene"]),
threads: 4
params:
Expand Down Expand Up @@ -79,7 +79,7 @@ rule append_nextclade_columns:
Append the nextclade results to the metadata
"""
input:
metadata="data/metadata_all.tsv",
metadata="data/all/metadata.tsv",
genotype_nextclade="results/nextclade_genotypes.tsv",
output:
metadata_all="data/metadata_nextclade.tsv",
Expand Down Expand Up @@ -136,7 +136,7 @@ rule append_gene_coverage_columns:
metadata="data/metadata_nextclade.tsv",
gene_coverage=expand("results/{gene}/gene_coverage_all.tsv", gene=config["nextclade"]["gene"])
output:
metadata_all="results/metadata_all.tsv",
metadata_all="results/all/metadata.tsv",
params:
id_field=config["curate"]["id_field"],
shell:
Expand All @@ -159,9 +159,9 @@ rule split_metadata_by_serotype:
Split the metadata by serotype
"""
input:
metadata="results/metadata_all.tsv",
metadata="results/all/metadata.tsv",
output:
serotype_metadata="results/metadata_{serotype}.tsv"
serotype_metadata="results/{serotype}/metadata.tsv"
wildcard_constraints:
serotype=SEROTYPE_CONSTRAINTS
params:
Expand Down
6 changes: 3 additions & 3 deletions ingest/rules/split_serotypes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ rule split_by_serotype_genbank:
Split the data by serotype based on the NCBI Genbank metadata.
"""
input:
metadata = "data/metadata_all.tsv",
sequences = "results/sequences_all.fasta"
metadata = "data/all/metadata.tsv",
sequences = "results/all/sequences.fasta"
output:
sequences = "results/sequences_{serotype}.fasta"
sequences = "results/{serotype}/sequences.fasta"
params:
id_field = config["curate"]["id_field"],
serotype_field = config["curate"]["serotype_field"]
Expand Down
4 changes: 2 additions & 2 deletions nextclade/config/config_dengue.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Sequences must be FASTA and metadata must be TSV
# Both files must be zstd compressed
# Both files must have a {serotype} expandable field to be replaced by all, denv1-denv4
sequences_url: "https://data.nextstrain.org/files/workflows/dengue/sequences_{serotype}.fasta.zst"
metadata_url: "https://data.nextstrain.org/files/workflows/dengue/metadata_{serotype}.tsv.zst"
sequences_url: "https://data.nextstrain.org/files/workflows/dengue/{serotype}/sequences.fasta.zst"
metadata_url: "https://data.nextstrain.org/files/workflows/dengue/{serotype}/metadata.tsv.zst"

strain_id_field: "genbank_accession"
display_strain_field: "strain"
Expand Down
4 changes: 2 additions & 2 deletions phylogenetic/config/config_dengue.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Sequences must be FASTA and metadata must be TSV
# Both files must be zstd compressed
# Both files must have a {serotype} expandable field to be replaced by all, denv1-denv4
sequences_url: "https://data.nextstrain.org/files/workflows/dengue/sequences_{serotype}.fasta.zst"
metadata_url: "https://data.nextstrain.org/files/workflows/dengue/metadata_{serotype}.tsv.zst"
sequences_url: "https://data.nextstrain.org/files/workflows/dengue/{serotype}/sequences.fasta.zst"
metadata_url: "https://data.nextstrain.org/files/workflows/dengue/{serotype}/metadata.tsv.zst"

strain_id_field: "genbank_accession"
display_strain_field: "strain"
Expand Down
20 changes: 10 additions & 10 deletions phylogenetic/config/description.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@ We gratefully acknowledge the authors, originating and submitting laboratories o

We curate sequence data and metadata from NCBI as a starting point for our analyses. Curated sequences and metadata are available as flat files at:

* [data.nextstrain.org/files/workflows/dengue/sequences_all.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/sequences_all.fasta.zst)
* [data.nextstrain.org/files/workflows/dengue/metadata_all.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/metadata_all.tsv.zst)
* [data.nextstrain.org/files/workflows/dengue/sequences_denv1.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/sequences_denv1.fasta.zst)
* [data.nextstrain.org/files/workflows/dengue/metadata_denv1.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/metadata_denv1.tsv.zst)
* [data.nextstrain.org/files/workflows/dengue/sequences_denv2.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/sequences_denv2.fasta.zst)
* [data.nextstrain.org/files/workflows/dengue/metadata_denv2.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/metadata_denv2.tsv.zst)
* [data.nextstrain.org/files/workflows/dengue/sequences_denv3.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/sequences_denv3.fasta.zst)
* [data.nextstrain.org/files/workflows/dengue/metadata_denv3.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/metadata_denv3.tsv.zst)
* [data.nextstrain.org/files/workflows/dengue/sequences_denv4.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/sequences_denv4.fasta.zst)
* [data.nextstrain.org/files/workflows/dengue/metadata_denv4.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/metadata_denv4.tsv.zst)
* [data.nextstrain.org/files/workflows/dengue/all/sequences.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/all/sequences.fasta.zst)
* [data.nextstrain.org/files/workflows/dengue/all/metadata.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/all/metadata.tsv.zst)
* [data.nextstrain.org/files/workflows/dengue/denv1/sequences.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/denv1/sequences.fasta.zst)
* [data.nextstrain.org/files/workflows/dengue/denv1/metadata.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/denv1/metadata.tsv.zst)
* [data.nextstrain.org/files/workflows/dengue/denv2/sequences.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/denv2/sequences.fasta.zst)
* [data.nextstrain.org/files/workflows/dengue/denv2/metadata.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/denv2/metadata.tsv.zst)
* [data.nextstrain.org/files/workflows/dengue/denv3/sequences.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/denv3/sequences.fasta.zst)
* [data.nextstrain.org/files/workflows/dengue/denv3/metadata.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/denv3/metadata.tsv.zst)
* [data.nextstrain.org/files/workflows/dengue/denv4/sequences.fasta.zst](https://data.nextstrain.org/files/workflows/dengue/denv4/sequences.fasta.zst)
* [data.nextstrain.org/files/workflows/dengue/denv4/metadata.tsv.zst](https://data.nextstrain.org/files/workflows/dengue/denv4/metadata.tsv.zst)