Skip to content

Commit

Permalink
Merge pull request #114 from adaptyvbio/fix_splitting_bug
Browse files Browse the repository at this point in the history
Fix splitting bug
  • Loading branch information
elkoz authored Sep 14, 2023
2 parents 2e98490 + df0ff7c commit 212f7e2
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 4 deletions.
1 change: 1 addition & 0 deletions .conda/default/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ requirements:
- bs4
- rcsbsearch
- mmseqs2
- foldseek

about:
home: https://github.com/adaptyvbio/ProteinFlow
Expand Down
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# Base image: official Python 3.10 image (Debian Buster variant).
FROM python:3.10-buster
RUN python -m pip install --upgrade pip setuptools wheel
# Download and unpack the MMseqs2 and Foldseek AVX2 release tarballs into the
# build working directory; they extract to ./mmseqs and ./foldseek.
RUN wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz; tar xvfz mmseqs-linux-avx2.tar.gz;
RUN wget https://mmseqs.com/foldseek/foldseek-linux-avx2.tar.gz; tar xvzf foldseek-linux-avx2.tar.gz;
RUN python -m pip install prody==2.4.0
# rcsbsearch is installed from a pinned upstream commit rather than from PyPI.
RUN python -m pip install "rcsbsearch @ git+https://github.com/sbliven/rcsbsearch@dbdfe3880cc88b0ce57163987db613d579400c8e"
RUN python -m pip install proteinflow
# Kept for shells that source ~/.bashrc (interactive bash sessions).
RUN echo "export PATH=$(pwd)/mmseqs/bin/:$PATH" >> ~/.bashrc
RUN echo "export PATH=$(pwd)/foldseek/bin/:$PATH" >> ~/.bashrc
# ~/.bashrc is not read by non-interactive commands (`docker run <image> <cmd>`,
# later RUN steps), so also set PATH at image level to make `mmseqs` and
# `foldseek` resolvable there as well.
# NOTE(review): assumes the build working directory is "/" (no WORKDIR is set
# above, so the tarballs unpack to /mmseqs and /foldseek) — confirm.
ENV PATH="/mmseqs/bin:/foldseek/bin:${PATH}"
4 changes: 4 additions & 0 deletions proteinflow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ def generate_data(
load_ligands=False,
exclude_chains_without_ligands=False,
tanimoto_clustering=False,
foldseek=False,
require_ligand=False,
random_seed=42,
max_chains=10,
Expand Down Expand Up @@ -337,6 +338,8 @@ def generate_data(
if `True`, exclude biounits that don't contain ligands
tanimoto_clustering : bool, default False
if `True`, cluster the biounits based on ligand Tanimoto similarity
foldseek : bool, default False
if `True`, cluster the biounits based on structure similarity
require_ligand : bool, default False
if `True`, only use biounits that contain a ligand
random_seed : int, default 42
Expand Down Expand Up @@ -424,6 +427,7 @@ def generate_data(
random_seed=random_seed,
exclude_chains_without_ligands=exclude_chains_without_ligands,
tanimoto_clustering=tanimoto_clustering,
foldseek=foldseek,
)
shutil.rmtree(tmp_folder)
return log_dict
Expand Down
5 changes: 5 additions & 0 deletions proteinflow/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,11 @@ def download(**kwargs):
is_flag=True,
help="Whether to use Tanimoto Clustering instead of MMSeqs2. Only works if load_ligands is set to True",
)
@click.option(
"--foldseek",
is_flag=True,
help="Whether to use FoldSeek to cluster the dataset",
)
@click.option(
"--require_ligand",
is_flag=True,
Expand Down
10 changes: 6 additions & 4 deletions proteinflow/split/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def _run_foldseek(data_folder, tmp_folder, min_seq_id):
subprocess.run(["rm", "-r", os.path.join(tmp_folder, folder, "tmp")])


def _read_clusters(tmp_folder, cdr=None):
def _read_clusters(tmp_folder, cdr=None, foldseek=False):
"""Read the output from MMSeqs2 and produce two dictionaries that store the cluster information.
In cluster_dict, values are the full names (pdb + chains), whereas in cluster_pdb_dict, values are just the PDB ids (so fewer but larger clusters).
Expand All @@ -122,7 +122,7 @@ def _read_clusters(tmp_folder, cdr=None):
sequence_name = line[1:-1]
cluster_name = "".join(cluster_name.split(".pdb"))
sequence_name = "".join(sequence_name.split(".pdb"))
if "-" in cluster_name:
if foldseek:
cluster_name = cluster_name[:4] + cluster_name[6:]
sequence_name = sequence_name[:4] + sequence_name[6:]
if cdr is not None:
Expand All @@ -131,7 +131,7 @@ def _read_clusters(tmp_folder, cdr=None):
elif line[0] == ">":
sequence_name = line[1:-1]
sequence_name = "".join(sequence_name.split(".pdb"))
if "-" in sequence_name:
if foldseek:
sequence_name = sequence_name[:4] + sequence_name[6:]
found_header = True

Expand Down Expand Up @@ -1157,12 +1157,14 @@ def _build_dataset_partition(
merged_seqs_dict = _load_pdbs(
dataset_dir, cdr=cdr
) # keys: pdb_id, values: list of chains and sequences

lengths = []
for k, v in merged_seqs_dict.items():
lengths += [len(x[1]) for x in v]
merged_seqs_dict = _merge_chains(
merged_seqs_dict
) # remove redundant chains

# write sequences to a fasta file for clustering with MMSeqs2, run MMSeqs2 and delete the fasta file
fasta_file = os.path.join(tmp_folder, "all_seqs.fasta")
_write_fasta(
Expand All @@ -1180,6 +1182,7 @@ def _build_dataset_partition(
c_dict, c_pdb_dict = _read_clusters(
tmp_folder=tmp_folder,
cdr=cdr,
foldseek=foldseek,
)
clusters_dict.update(c_dict)
clusters_pdb_dict.update(c_pdb_dict)
Expand Down Expand Up @@ -1373,7 +1376,6 @@ def _get_excluded_files(
break

# return list of biounits to exclude
print(f"{exclude_biounits=}")
return exclude_biounits


Expand Down

0 comments on commit 212f7e2

Please sign in to comment.