Merge pull request #114 from adaptyvbio/fix_splitting_bug

Fix splitting bug
adaptyvbio · Sep 14, 2023 · 212f7e2
2 parents 2e98490 + df0ff7c
commit 212f7e2
Show file tree

Hide file tree

Showing 5 changed files with 18 additions and 4 deletions.
diff --git a/.conda/default/meta.yaml b/.conda/default/meta.yaml
@@ -42,6 +42,7 @@ requirements:
     - bs4
     - rcsbsearch
     - mmseqs2
+    - foldseek
 
 about:
   home: https://github.com/adaptyvbio/ProteinFlow

diff --git a/Dockerfile b/Dockerfile
@@ -1,7 +1,9 @@
 FROM python:3.10-buster
 RUN python -m pip install --upgrade pip setuptools wheel
 RUN wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz; tar xvfz mmseqs-linux-avx2.tar.gz;
+RUN wget https://mmseqs.com/foldseek/foldseek-linux-avx2.tar.gz; tar xvzf foldseek-linux-avx2.tar.gz;
 RUN python -m pip install prody==2.4.0
 RUN python -m pip install "rcsbsearch @ git+https://github.com/sbliven/rcsbsearch@dbdfe3880cc88b0ce57163987db613d579400c8e"
 RUN python -m pip install proteinflow
 RUN echo "export PATH=$(pwd)/mmseqs/bin/:$PATH" >> ~/.bashrc
+RUN echo "export PATH=$(pwd)/foldseek/bin/:$PATH" >> ~/.bashrc
diff --git a/proteinflow/__init__.py b/proteinflow/__init__.py
@@ -249,6 +249,7 @@ def generate_data(
     load_ligands=False,
     exclude_chains_without_ligands=False,
     tanimoto_clustering=False,
+    foldseek=False,
     require_ligand=False,
     random_seed=42,
     max_chains=10,
@@ -337,6 +338,8 @@ def generate_data(
         if `True`, exclude biounits that don't contain ligands
     tanimoto_clustering : bool, default False
         if `True`, cluster the biounits based on ligand Tanimoto similarity
+    foldseek : bool, default False
+        if `True`, cluster the biounits based on structure similarity
     require_ligand : bool, default False
         if `True`, only use biounits that contain a ligand
     random_seed : int, default 42
@@ -424,6 +427,7 @@ def generate_data(
             random_seed=random_seed,
             exclude_chains_without_ligands=exclude_chains_without_ligands,
             tanimoto_clustering=tanimoto_clustering,
+            foldseek=foldseek,
         )
     shutil.rmtree(tmp_folder)
     return log_dict

diff --git a/proteinflow/cli.py b/proteinflow/cli.py
@@ -187,6 +187,11 @@ def download(**kwargs):
     is_flag=True,
     help="Whether to use Tanimoto Clustering instead of MMSeqs2. Only works if load_ligands is set to True",
 )
+@click.option(
+    "--foldseek",
+    is_flag=True,
+    help="Whether to use FoldSeek to cluster the dataset",
+)
 @click.option(
     "--require_ligand",
     is_flag=True,

diff --git a/proteinflow/split/__init__.py b/proteinflow/split/__init__.py
@@ -97,7 +97,7 @@ def _run_foldseek(data_folder, tmp_folder, min_seq_id):
     subprocess.run(["rm", "-r", os.path.join(tmp_folder, folder, "tmp")])
 
 
-def _read_clusters(tmp_folder, cdr=None):
+def _read_clusters(tmp_folder, cdr=None, foldseek=False):
     """Read the output from MMSeqs2 and produces 2 dictionaries that store the clusters information.
 
     In cluster_dict, values are the full names (pdb + chains) whereas in cluster_pdb_dict, values are just the PDB ids (so less clusters but bigger).
@@ -122,7 +122,7 @@ def _read_clusters(tmp_folder, cdr=None):
                 sequence_name = line[1:-1]
                 cluster_name = "".join(cluster_name.split(".pdb"))
                 sequence_name = "".join(sequence_name.split(".pdb"))
-                if "-" in cluster_name:
+                if foldseek:
                     cluster_name = cluster_name[:4] + cluster_name[6:]
                     sequence_name = sequence_name[:4] + sequence_name[6:]
                 if cdr is not None:
@@ -131,7 +131,7 @@ def _read_clusters(tmp_folder, cdr=None):
             elif line[0] == ">":
                 sequence_name = line[1:-1]
                 sequence_name = "".join(sequence_name.split(".pdb"))
-                if "-" in sequence_name:
+                if foldseek:
                     sequence_name = sequence_name[:4] + sequence_name[6:]
                 found_header = True
 
@@ -1157,12 +1157,14 @@ def _build_dataset_partition(
                 merged_seqs_dict = _load_pdbs(
                     dataset_dir, cdr=cdr
                 )  # keys: pdb_id, values: list of chains and sequences
+
                 lengths = []
                 for k, v in merged_seqs_dict.items():
                     lengths += [len(x[1]) for x in v]
                 merged_seqs_dict = _merge_chains(
                     merged_seqs_dict
                 )  # remove redundant chains
+
                 # write sequences to a fasta file for clustering with MMSeqs2, run MMSeqs2 and delete the fasta file
                 fasta_file = os.path.join(tmp_folder, "all_seqs.fasta")
                 _write_fasta(
@@ -1180,6 +1182,7 @@ def _build_dataset_partition(
             c_dict, c_pdb_dict = _read_clusters(
                 tmp_folder=tmp_folder,
                 cdr=cdr,
+                foldseek=foldseek,
             )
             clusters_dict.update(c_dict)
             clusters_pdb_dict.update(c_pdb_dict)
@@ -1373,7 +1376,6 @@ def _get_excluded_files(
                 break
 
     # return list of biounits to exclude
-    print(f"{exclude_biounits=}")
     return exclude_biounits
-Original file line number
+Diff line change
@@ Expand Up / @@ -42,6 +42,7 @@ requirements: @@
         - bs4
         - rcsbsearch
         - mmseqs2
+        - foldseek
     about:
       home: https://github.com/adaptyvbio/ProteinFlow
@@ Expand Down @@