Skip to content

Commit

Permalink
v2.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
AstraBert committed May 23, 2024
1 parent ea01ef0 commit e7cd60e
Show file tree
Hide file tree
Showing 12 changed files with 623 additions and 10 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,12 @@ Choose the task among:
- *image-generation-pollinations*: stable diffusion, use Pollinations AI API; if you choose 'image-generation-pollinations', you do not need to specify anything else apart from the task - **MULTILINGUAL**
- *image-classification*: classify an image, supports every image-classification model on HF Hub - **ENGLISH ONLY**
- *image-to-text*: describe an image, supports every image-to-text model on HF Hub - **ENGLISH ONLY**
- *audio-classification*: classify audio files or microphone recordings, supports audio-classification models on HF hub
- *speech-recognition*: transcribe audio files or microphone recordings, supports automatic-speech-recognition models on HF hub.
- *video-generation*: generate video upon text prompt, supports text-to-video models on HF hub - **ENGLISH ONLY**
- *protein-folding*: get the 3D structure of a protein from its amino-acid sequence, using ESM-2 backbone model - **GPU ONLY**
- *autotrain*: fine-tune a model on a specific downstream task with autotrain-advanced, just by specifying your HF username, an HF write token and the path to a yaml config file for the training
- *spaces-api-supabase*: use HF Spaces API in combination with Supabase PostgreSQL databases in order to unleash more powerful LLMs and larger RAG-oriented vector databases - **MULTILINGUAL**
- *image-retrieval-search*: search an image database uploading a folder as database input. The folder should have the following structure:

```
Expand All @@ -68,6 +74,8 @@ Choose the task among:
├── label1/
└── label2/
```


You can query the database starting from your own pictures.

### 6. Go to `localhost:7860` and start using your assistant
Expand Down
4 changes: 2 additions & 2 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ WORKDIR /app
# Add the current directory contents into the container at /app
ADD . /app

# Add new package
RUN python3 -m pip install datasets==2.15.0
# Install autotrain-advanced
RUN pip install autotrain-advanced

# Expose the port that the application will run on
EXPOSE 8760
Expand Down
50 changes: 50 additions & 0 deletions docker/audio_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from transformers import pipeline
from argparse import ArgumentParser
import torch
import gradio as gr
import numpy as np

# Command-line interface: the single required option is the model identifier.
argparse = ArgumentParser()
argparse.add_argument(
    "-m",
    "--model",
    required=True,
    help="HuggingFace Model identifier, such as 'google/flan-t5-base'",
)
args = argparse.parse_args()

# Remove every single and double quote character from the identifier.
mod = args.model.translate(str.maketrans("", "", "\"'"))
model_checkpoint = mod

# Audio-classification pipeline used by the Gradio callback below.
classifier = pipeline(model=mod, task="audio-classification")

def classify_text(audio):
    """Classify a Gradio audio input and return the top predicted label.

    Args:
        audio: ``(sample_rate, samples)`` pair as delivered by the ``gr.Audio``
            input component, or ``None`` when nothing was uploaded/recorded.

    Returns:
        The ``"label"`` of the first prediction returned by ``classifier``,
        or a short message when there is no audio to classify.
    """
    if audio is None:
        return "No audio provided."
    sr, data = audio
    waveform = np.asarray(data)
    if waveform.size == 0:
        return "No audio provided."

    # Integer PCM samples are rescaled to roughly [-1, 1]; float input is
    # assumed to be normalised already and is only cast to float32.
    if np.issubdtype(waveform.dtype, np.integer):
        scale = float(np.iinfo(waveform.dtype).max)
    else:
        scale = 1.0
    waveform = waveform.astype(np.float32) / scale

    # Multi-channel audio is expected as (samples, channels); average the
    # channels so the pipeline receives a 1-D signal.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Pass the sample rate along so the pipeline can resample to the rate the
    # model expects instead of silently misreading the signal.
    # NOTE(review): resampling inside the pipeline relies on torchaudio being
    # installed in the image - confirm.
    res = classifier({"sampling_rate": sr, "raw": waveform})
    return res[0]["label"]

# Audio input: accepts an uploaded file or a microphone recording.
input_audio = gr.Audio(
    waveform_options=gr.WaveformOptions(
        show_controls=False,
        skip_length=2,
        waveform_progress_color="#0066B4",
        waveform_color="#01C6FF",
    ),
    sources=["upload", "microphone"],
)

# Gradio app: one audio input, the predicted label as text output.
demo = gr.Interface(
    fn=classify_text,
    inputs=input_audio,
    outputs="text",
    title="everything-ai-audioclass",
)

if __name__ == "__main__":
    # Listen on all interfaces; no public share link.
    demo.launch(share=False, server_name="0.0.0.0")
43 changes: 43 additions & 0 deletions docker/autotrain_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import subprocess as sp
import gradio as gr
import subprocess as sp


def build_command(hf_usr, hf_token, configpath):
    """Run ``autotrain --config <configpath>`` with the HF credentials set.

    The credentials are handed to the ``autotrain`` process through its
    environment. Running ``export ...`` in separate shells (as done
    previously) has no effect on later commands, because each shell exits
    before ``autotrain`` starts.

    Args:
        hf_usr: Hugging Face username, exposed as ``HF_USERNAME``.
        hf_token: Hugging Face token, exposed as ``HF_TOKEN``.
        configpath: Path to the yaml configuration file for autotrain.

    Returns:
        A textual rendering of the equivalent shell commands.

    Raises:
        FileNotFoundError: If the ``autotrain`` executable is not on ``PATH``.
    """
    import os  # local import: not part of this file's import block

    # The values come from multi-line textboxes, so drop stray surrounding
    # whitespace/newlines before using them as an argument or env value.
    env = {
        **os.environ,
        "HF_USERNAME": hf_usr.strip(),
        "HF_TOKEN": hf_token.strip(),
    }
    # Argument list + no shell: user input is never interpreted by a shell.
    sp.run(["autotrain", "--config", configpath.strip()], env=env)
    return f"export HF_USERNAME={hf_usr}\nexport HF_TOKEN={hf_token}\nautotrain --config {configpath}"


# Gradio app: three text inputs (username, token, config path) and a textbox
# that shows what build_command returns.
demo = gr.Interface(
    fn=build_command,
    inputs=[
        gr.Textbox(
            value="your-cute-name",
            lines=3,
            label="HF username",
            info="Your HF username",
        ),
        gr.Textbox(
            value="your-powerful-token",
            lines=3,
            label="HF write token",
            info="An HF token that has write permissions on your repository",
        ),
        gr.Textbox(
            value="/path/to/config.yaml",
            lines=3,
            label="Yaml configuration file",
            info="Path to the yaml configuration file containing the information to use autotrain",
        ),
    ],
    outputs="textbox",
    title="everything-ai-autotrain",
    theme=gr.themes.Base(),
)

if __name__ == "__main__":
    # Listen on all interfaces on port 7860; no public share link.
    demo.launch(share=False, server_port=7860, server_name="0.0.0.0")


60 changes: 60 additions & 0 deletions docker/protein_folding_with_esm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from transformers import AutoTokenizer, EsmForProteinFolding
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37
from proteins_viz import *
import gradio as gr

def convert_outputs_to_pdb(outputs):
    """Turn folding-model outputs into a list of PDB-formatted strings.

    Args:
        outputs: Mapping of output names to tensors. The keys read here are
            "positions", "atom37_atom_exists", "aatype", "residue_index",
            "plddt" and, when present, "chain_index".

    Returns:
        One PDB string per entry along the first axis of ``outputs["aatype"]``.
    """
    # Atom positions must be derived from the tensors before they are
    # converted to numpy below.
    positions = atom14_to_atom37(outputs["positions"][-1], outputs).cpu().numpy()
    arrays = {name: tensor.to("cpu").numpy() for name, tensor in outputs.items()}

    atom_mask = arrays["atom37_atom_exists"]
    has_chain_index = "chain_index" in arrays

    def _as_pdb(idx):
        # Build the protein record for one entry and serialise it.
        protein = OFProtein(
            aatype=arrays["aatype"][idx],
            atom_positions=positions[idx],
            atom_mask=atom_mask[idx],
            residue_index=arrays["residue_index"][idx] + 1,
            b_factors=arrays["plddt"][idx],
            chain_index=arrays["chain_index"][idx] if has_chain_index else None,
        )
        return to_pdb(protein)

    return [_as_pdb(idx) for idx in range(arrays["aatype"].shape[0])]

# Load the tokenizer and folding model from the "facebook/esmfold_v1" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)

# Move the model to the GPU; this script requires CUDA to be available.
model = model.cuda()

# Convert only the `esm` submodule to half precision (float16).
model.esm = model.esm.half()

import torch

# Allow TF32 for CUDA matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True

# NOTE(review): presumably chunking in the trunk trades speed for lower GPU
# memory use - confirm against the ESMFold documentation.
model.trunk.set_chunk_size(64)

def fold_protein(test_protein):
    """Predict a structure for an amino-acid sequence and return its picture.

    The sequence is tokenized without special tokens, run through ``model``
    on the GPU, written to "output_structure.pdb" and rendered by
    ``take_care``.

    Args:
        test_protein: Amino-acid sequence as a string.

    Returns:
        Whatever ``take_care`` returns for the written PDB file.
    """
    encoded = tokenizer([test_protein], return_tensors="pt", add_special_tokens=False)
    input_ids = encoded['input_ids'].cuda()

    # Inference only: no gradients are needed.
    with torch.no_grad():
        prediction = model(input_ids)

    pdb_entries = convert_outputs_to_pdb(prediction)
    with open("output_structure.pdb", "w") as handle:
        handle.writelines(pdb_entries)

    return take_care("output_structure.pdb")

# Gradio app: sequence text in, rendered structure image out.
iface = gr.Interface(
    fn=fold_protein,
    inputs="text",
    outputs="image",
    title="everything-ai-proteinfold",
)

# Listen on all interfaces; no public share link.
iface.launch(share=False, server_name="0.0.0.0")
135 changes: 135 additions & 0 deletions docker/proteins_viz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import pandas as pd
from biopandas.pdb import PandasPdb
from prody import parsePDBHeader




def read_pdb_to_dataframe(
    pdb_path,
    model_index: int = 1,
    parse_header: bool = True,
) -> "tuple[pd.DataFrame, dict | None]":
    """
    Read a PDB file and return its atomic coordinates together with its header.

    Args:
        pdb_path (str): Path to a local PDB file to read.
        model_index (int, optional): Index of the model to extract from the PDB file, in case
            it contains multiple models. Defaults to 1.
        parse_header (bool, optional): Whether to parse the PDB header and extract metadata.
            Defaults to True.

    Returns:
        tuple: ``(atoms, header)`` where ``atoms`` is a DataFrame with the "ATOM" records
            followed by the "HETATM" records of the selected model, and ``header`` is the
            result of ``parsePDBHeader`` (``None`` when ``parse_header`` is False).

    Raises:
        ValueError: If the selected model contains no "ATOM" records.
    """
    structure = PandasPdb().read_pdb(pdb_path)
    header = parsePDBHeader(pdb_path) if parse_header else None

    structure = structure.get_model(model_index)
    if len(structure.df["ATOM"]) == 0:
        raise ValueError(f"No model found for index: {model_index}")

    atoms = pd.concat([structure.df["ATOM"], structure.df["HETATM"]])
    return atoms, header

from graphein.protein.graphs import label_node_id

def process_dataframe(df: pd.DataFrame, granularity='CA') -> pd.DataFrame:
    """
    Process a DataFrame of protein structure data to reduce ambiguity and simplify analysis.

    This function performs the following steps:
    1. Handles alternate locations for an atom: empty values are treated as 'A' and only
       rows at location 'A' are kept.
    2. Assigns a node_id to the rows, using the helper function label_node_id.
    3. Filters the rows on `atom_name == granularity` (defaults to 'CA' for alpha carbon).

    The input DataFrame is left untouched; all changes are made on a copy.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing protein structure data to process. It is expected to
        contain the column 'atom_name' and, optionally, 'alt_loc'.
    granularity : str, optional
        Atom name to keep, also forwarded to label_node_id. Defaults to 'CA' (alpha carbon).

    Returns
    -------
    pd.DataFrame
        The filtered, labelled rows.
    """
    if 'alt_loc' in df.columns:
        # Select on a normalised series instead of writing into the caller's frame.
        keep = df['alt_loc'].replace('', 'A') == 'A'
        df = df.loc[keep].copy()
        df['alt_loc'] = 'A'
    else:
        # NOTE(review): label_node_id appears to add columns to the frame it is
        # given - confirm; copying keeps the caller's DataFrame unchanged either way.
        df = df.copy()
    df = label_node_id(df, granularity)
    return df.loc[df['atom_name'] == granularity]


from graphein.protein.graphs import initialise_graph_with_metadata
from graphein.protein.graphs import add_nodes_to_graph
from graphein.protein.visualisation import plotly_protein_structure_graph
from PIL import Image
import networkx as nx

def _add_backbone_edges(G: "nx.Graph") -> "nx.Graph":
    """Link residues with consecutive residue numbers within each chain by a 'backbone_bond' edge."""
    for chain_id in G.graph["chain_ids"]:
        # Residues of this chain, in node order.
        chain_residues = [
            (n, v) for n, v in G.nodes(data=True) if v["chain_id"] == chain_id
        ]
        # Walk over neighbouring pairs; the last residue has no successor.
        for (node, attrs), (next_node, next_attrs) in zip(chain_residues, chain_residues[1:]):
            # Only residues with adjacent residue numbers are bonded.
            if abs(attrs["residue_number"] - next_attrs["residue_number"]) != 1:
                continue
            # Look the edge up by node id (not by list position) so an existing
            # edge gets the extra kind instead of having its kinds overwritten.
            if G.has_edge(node, next_node):
                G.edges[node, next_node].setdefault("kind", set()).add('backbone_bond')
            else:
                G.add_edge(node, next_node, kind={'backbone_bond'})
    return G


def take_care(pdb_path, pdb_code='3nir', image_file="protein_graph.png"):
    """Render the backbone graph of a PDB file and return it as a PIL image.

    Args:
        pdb_path: Path to the PDB file to visualise.
        pdb_code: Identifier stored in the graph metadata. Defaults to '3nir',
            the value that was previously hard-coded.
        image_file: Path the PNG is written to before it is opened. Defaults
            to "protein_graph.png".

    Returns:
        The PIL image opened from ``image_file``.
    """
    df, header = read_pdb_to_dataframe(pdb_path)
    process_df = process_dataframe(df)

    g = initialise_graph_with_metadata(
        protein_df=process_df,
        raw_pdb_df=df,  # kept for traceability
        pdb_code=pdb_code,
        granularity='CA',  # matches the default granularity of process_dataframe
    )
    g = add_nodes_to_graph(g)
    g = _add_backbone_edges(g)

    p = plotly_protein_structure_graph(
        g,
        colour_edges_by="kind",
        colour_nodes_by="seq_position",
        label_node_ids=False,
        plot_title="Backbone Protein Graph",
        node_size_multiplier=1,
    )
    p.write_image(image_file, format='png')

    # Load the PNG image into a PIL image
    return Image.open(image_file)
Loading

0 comments on commit e7cd60e

Please sign in to comment.