v2.0.0

AstraBert · May 23, 2024 · e7cd60e · e7cd60e
1 parent ea01ef0
commit e7cd60e
Show file tree

Hide file tree

Showing 12 changed files with 623 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -57,6 +57,12 @@ Choose the task among:
 - *image-generation-pollinations*: stable diffusion, use Pollinations AI API; if you choose 'image-generation-pollinations', you do not need to specify anything else apart from the task - **MULTILINGUAL**
 - *image-classification*: classify an image, supports every image-classification model on HF Hub - **ENGLISH ONLY**
 - *image-to-text*:  describe an image, supports every image-to-text model on HF Hub - **ENGLISH ONLY**
+- *audio-classification*: classify audio files or microphone recordings, supports audio-classification models on HF Hub
+- *speech-recognition*: transcribe audio files or microphone recordings, supports automatic-speech-recognition models on HF Hub
+- *video-generation*: generate a video from a text prompt, supports text-to-video models on HF Hub - **ENGLISH ONLY**
+- *protein-folding*: get the 3D structure of a protein from its amino-acid sequence, using ESM-2 backbone model - **GPU ONLY**
+- *autotrain*: fine-tune a model on a specific downstream task with autotrain-advanced, just by specifying your HF username, an HF write token and the path to a YAML config file for the training
+- *spaces-api-supabase*: use HF Spaces API in combination with Supabase PostgreSQL databases in order to unleash more powerful LLMs and larger RAG-oriented vector databases - **MULTILINGUAL**
 - *image-retrieval-search*: search an image database uploading a folder as database input. The folder should have the following structure:
 
 ```
@@ -68,6 +74,8 @@ Choose the task among:
     ├── label1/
     └── label2/
 ```
+
+
 You can query the database starting from your own pictures.
 
 ### 6. Go to `localhost:7860` and start using your assistant

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -7,8 +7,8 @@ WORKDIR /app
 # Add the current directory contents into the container at /app
 ADD . /app
 
-# Add new package
-RUN python3 -m pip install datasets==2.15.0
+# Install autotrain-advanced (needed by the autotrain task)
+RUN pip install autotrain-advanced
 
 # Expose the port that the application will run on
 EXPOSE 8760

diff --git a/docker/audio_classification.py b/docker/audio_classification.py
@@ -0,0 +1,50 @@
+from transformers import pipeline
+from argparse import ArgumentParser
+import torch
+import gradio as gr
+import numpy as np
+
+argparse = ArgumentParser()
+argparse.add_argument(
+    "-m",
+    "--model",
+    help="HuggingFace Model identifier, such as 'google/flan-t5-base'",
+    required=True,
+)
+
+args = argparse.parse_args()
+
+
+mod = args.model
+mod = mod.replace("\"", "").replace("'", "")
+
+model_checkpoint = mod
+
+# Audio class
+classifier = pipeline(task="audio-classification", model=mod)
+
+def classify_text(audio):
+    global classifier
+    sr, data = audio
+    short_tensor = data.astype(np.float32)
+    res = classifier(short_tensor)
+    return res[0]["label"]
+
+input_audio = gr.Audio(
+    sources=["upload","microphone"],
+    waveform_options=gr.WaveformOptions(
+        waveform_color="#01C6FF",
+        waveform_progress_color="#0066B4",
+        skip_length=2,
+        show_controls=False,
+    ),
+)
+demo = gr.Interface(
+    title="everything-ai-audioclass",
+    fn=classify_text,
+    inputs=input_audio,
+    outputs="text"
+)
+
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", share=False)
diff --git a/docker/autotrain_interface.py b/docker/autotrain_interface.py
@@ -0,0 +1,43 @@
+import subprocess as sp
+import gradio as gr
+import subprocess as sp
+
+
+def build_command(hf_usr, hf_token, configpath):
+    sp.run(f"export HF_USERNAME=\"{hf_usr}\"", shell=True)
+    sp.run(f"export HF_TOKEN=\"{hf_token}\"", shell=True)
+    sp.run(f"autotrain --config {configpath}", shell=True)
+    return f"export HF_USERNAME={hf_usr}\nexport HF_TOKEN={hf_token}\nautotrain --config {configpath}"
+
+
+demo = gr.Interface(
+    build_command,
+    [
+        gr.Textbox(
+            label="HF username",
+            info="Your HF username",
+            lines=3,
+            value=f"your-cute-name",
+        ),
+        gr.Textbox(
+            label="HF write token",
+            info="An HF token that has write permissions on your repository",
+            lines=3,
+            value=f"your-powerful-token",
+        ),
+        gr.Textbox(
+            label="Yaml configuration file",
+            info="Path to the yaml configuration file containing the information to use autotrain",
+            lines=3,
+            value="/path/to/config.yaml",
+        )
+    ],
+    title="everything-ai-autotrain",
+    outputs="textbox",
+    theme=gr.themes.Base()
+)
+
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
+
+
diff --git a/docker/protein_folding_with_esm.py b/docker/protein_folding_with_esm.py
@@ -0,0 +1,60 @@
+from transformers import AutoTokenizer, EsmForProteinFolding
+from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
+from transformers.models.esm.openfold_utils.feats import atom14_to_atom37
+from proteins_viz import *
+import gradio as gr
+
+def convert_outputs_to_pdb(outputs):
+    final_atom_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
+    outputs = {k: v.to("cpu").numpy() for k, v in outputs.items()}
+    final_atom_positions = final_atom_positions.cpu().numpy()
+    final_atom_mask = outputs["atom37_atom_exists"]
+    pdbs = []
+    for i in range(outputs["aatype"].shape[0]):
+        aa = outputs["aatype"][i]
+        pred_pos = final_atom_positions[i]
+        mask = final_atom_mask[i]
+        resid = outputs["residue_index"][i] + 1
+        pred = OFProtein(
+            aatype=aa,
+            atom_positions=pred_pos,
+            atom_mask=mask,
+            residue_index=resid,
+            b_factors=outputs["plddt"][i],
+            chain_index=outputs["chain_index"][i] if "chain_index" in outputs else None,
+        )
+        pdbs.append(to_pdb(pred))
+    return pdbs
+
+# Load the tokenizer and folding model for the "facebook/esmfold_v1" checkpoint.
+tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
+model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)
+
+# Move the model to the GPU; this script requires a CUDA device.
+model = model.cuda()
+
+# Convert only the `esm` submodule to half precision
+# (presumably to reduce GPU memory use - TODO confirm).
+model.esm = model.esm.half()
+
+import torch
+
+# Allow TF32 in CUDA matrix multiplications.
+torch.backends.cuda.matmul.allow_tf32 = True
+
+# Set the chunk size on the model trunk
+# (presumably a memory/speed trade-off - TODO confirm the chosen value).
+model.trunk.set_chunk_size(64)
+
+def fold_protein(test_protein):
+    tokenized_input = tokenizer([test_protein], return_tensors="pt", add_special_tokens=False)['input_ids']
+    tokenized_input = tokenized_input.cuda()
+    with torch.no_grad():
+        output = model(tokenized_input)
+    pdb = convert_outputs_to_pdb(output)
+    with open("output_structure.pdb", "w") as f:
+        f.write("".join(pdb))
+    image = take_care("output_structure.pdb")
+    return image
+
+iface = gr.Interface(
+    title="everything-ai-proteinfold",
+    fn=fold_protein,
+    inputs="text",
+    outputs="image", 
+)
+
+iface.launch(server_name="0.0.0.0", share=False)
diff --git a/docker/proteins_viz.py b/docker/proteins_viz.py
@@ -0,0 +1,135 @@
+import pandas as pd
+from biopandas.pdb import PandasPdb
+from prody import parsePDBHeader
+
+
+
+
+def read_pdb_to_dataframe(
+    pdb_path,
+    model_index: int = 1,
+    parse_header: bool = True,
+    ) -> pd.DataFrame:
+    """
+    Read a PDB file, and return a Pandas DataFrame containing the atomic coordinates and metadata.
+
+    Args:
+        pdb_path (str, optional): Path to a local PDB file to read. Defaults to None.
+        model_index (int, optional): Index of the model to extract from the PDB file, in case
+            it contains multiple models. Defaults to 1.
+        parse_header (bool, optional): Whether to parse the PDB header and extract metadata.
+            Defaults to True.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing the atomic coordinates and metadata, with one row
+            per atom
+    """
+    atomic_df = PandasPdb().read_pdb(pdb_path)
+    if parse_header:
+        header = parsePDBHeader(pdb_path)
+    else:
+        header = None
+    atomic_df = atomic_df.get_model(model_index)
+    if len(atomic_df.df["ATOM"]) == 0:
+        raise ValueError(f"No model found for index: {model_index}")
+
+    return pd.concat([atomic_df.df["ATOM"], atomic_df.df["HETATM"]]), header
+
+from graphein.protein.graphs import label_node_id
+
+def process_dataframe(df: pd.DataFrame, granularity='CA') -> pd.DataFrame:
+    """
+    Process a DataFrame of protein structure data to reduce ambiguity and simplify analysis.
+
+    This function performs the following steps:
+    1. Handles alternate locations for an atom, defaulting to keep the first one if multiple exist.
+    2. Assigns a unique node_id to each residue in the DataFrame, using a helper function label_node_id.
+    3. Filters the DataFrame based on specified granularity (defaults to 'CA' for alpha carbon).
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The DataFrame containing protein structure data to process. It is expected to contain columns 'alt_loc' and 'atom_name'.
+        
+    granularity : str, optional
+        The level of detail or perspective at which the DataFrame should be analyzed. Defaults to 'CA' (alpha carbon).
+    """
+    # handle the case of alternative locations,
+    # if so default to the 1st one = A
+    if 'alt_loc' in df.columns:
+      df['alt_loc'] = df['alt_loc'].replace('', 'A')
+      df = df.loc[(df['alt_loc']=='A')]
+    df = label_node_id(df, granularity)
+    df = df.loc[(df['atom_name']==granularity)]
+    return df
+
+
+from graphein.protein.graphs import initialise_graph_with_metadata
+from graphein.protein.graphs import add_nodes_to_graph
+from graphein.protein.visualisation import plotly_protein_structure_graph
+from PIL import Image
+import networkx as nx
+
+def take_care(pdb_path):
+
+
+    df, header = read_pdb_to_dataframe(pdb_path)
+    process_df = process_dataframe(df)
+
+    g = initialise_graph_with_metadata(protein_df=process_df, # from above cell
+                                        raw_pdb_df=df, # Store this for traceability
+                                        pdb_code = '3nir', #and again
+                                        granularity = 'CA' # Store this so we know what kind of graph we have
+                                        )
+    g = add_nodes_to_graph(g)
+
+
+    def add_backbone_edges(G: nx.Graph) -> nx.Graph:
+        # Iterate over every chain
+        for chain_id in G.graph["chain_ids"]:
+            # Find chain residues
+            chain_residues = [
+                (n, v) for n, v in G.nodes(data=True) if v["chain_id"] == chain_id
+            ]
+            # Iterate over every residue in chain
+            for i, residue in enumerate(chain_residues):
+                try:
+                    # Checks not at chain terminus
+                    if i == len(chain_residues) - 1:
+                        continue
+                    # Asserts residues are on the same chain
+                    cond_1 = ( residue[1]["chain_id"] == chain_residues[i + 1][1]["chain_id"])
+                    # Asserts residue numbers are adjacent
+                    cond_2 = (abs(residue[1]["residue_number"] - chain_residues[i + 1][1]["residue_number"])== 1)
+
+                    # If this checks out, we add a peptide bond
+                    if (cond_1) and (cond_2):
+                        # Adds "peptide bond" between current residue and the next
+                        if G.has_edge(i, i + 1):
+                            G.edges[i, i + 1]["kind"].add('backbone_bond')
+                        else:
+                            G.add_edge(residue[0],chain_residues[i + 1][0],kind={'backbone_bond'},)
+                except IndexError as e:
+                    print(e)
+        return G
+
+    g = add_backbone_edges(g)
+
+
+
+    p = plotly_protein_structure_graph(
+        g,
+        colour_edges_by="kind",
+        colour_nodes_by="seq_position",
+        label_node_ids=False,
+        plot_title="Backbone Protein Graph",
+        node_size_multiplier=1,
+    )
+    image_file = "protein_graph.png"
+    p.write_image(image_file, format='png')
+
+
+    # Load the PNG image into a PIL image
+    image = Image.open(image_file)
+
+    return image