Merge branch 'main' of https://github.com/TheBotiverse/Botiverse into main
TheBotiverse · Jul 24, 2023 · 847eac1
2 parents b83b646 + 05b7b4f
commit 847eac1
Show file tree

Hide file tree

Showing 7 changed files with 102 additions and 14 deletions.
diff --git a/botiverse/Theorizer/__init__.py b/botiverse/Theorizer/__init__.py
@@ -1 +1,22 @@
-from botiverse.Theorizer import generate
+import os
+# from botiverse.Theorizer import generate
+import gdown
+import zipfile
+
+curr_dir = os.path.dirname(os.path.abspath(__file__))
+
+model_dir = os.path.join(curr_dir,"model")
+squad_dir = os.path.join(curr_dir,"squad")
+
+if not os.path.exists(os.path.join(squad_dir,"sample_probs.pkl")):
+    print("Sample probs not found. Downloading Theorizer sample probs...")
+    url = "https://drive.google.com/uc?id=1UjZaqM9jf9nzeK1R7WdSSLNEVKdxJOHO"
+    gdown.download(url,os.path.join(squad_dir,"sample_probs.pkl"), quiet=False)
+    print("Done.")
+
+if not os.path.exists(os.path.join(model_dir,"pretrained-model")):
+    print("Weights not found. Downloading Theorizer weights...")
+    model_path = os.path.join(model_dir,"pretrained-model")
+    url = "https://drive.google.com/drive/folders/1rUvMP1HdE_H4TAMG8HxHT6z5Y0ZOnBsg"
+    gdown.download_folder(url,output=model_path,quiet=False)
+    print("Done.")
diff --git a/botiverse/Theorizer/squad/sample_data.py b/botiverse/Theorizer/squad/sample_data.py
@@ -226,14 +226,14 @@ def read_sample_probs(sample_probs_path):
     return sample_probs
 
 def select_with_default_sampel_probs(sentence):
-    sample_probs_path = "botiverse/Theorizer/squad/dataset/train.probs.pkl"
+    sample_probs_path = "botiverse/Theorizer/squad/sample_probs.pkl"
     sample_probs = read_sample_probs(sample_probs_path)
     selection = select(
         "Bob is eating a delicious cake in Vancouver.", sample_probs)
     return selection
 
 def test():
-    sample_probs_path = "squad/dataset/train.probs.pkl"
+    sample_probs_path = "botiverse/Theorizer/squad/sample_probs.pkl"
     sample_probs = read_sample_probs(sample_probs_path)
     selection = select(
         "Bob is eating a delicious cake in Vancouver.", sample_probs)

diff --git a/botiverse/Theorizer/squad/squad_example.py b/botiverse/Theorizer/squad/squad_example.py
@@ -8,7 +8,7 @@
 import multiprocess as mp
 import math
 import numpy as np
-
+import pickle as pkl
 
 @dataclass
 class SquadExample:
@@ -81,11 +81,30 @@ def create_squad_example_with_info(raw_ex: List[SquadExample]) -> List[SquadAugm
     Augment the raw examples with question-type and clue info.
     """
 
-    examples_with_info = []
-    for e in tqdm(raw_ex):
-        new_e = extract_clue_and_question_info(
-            sentence=e.context_text, question=e.question_text, answer=e.answer_text, answer_start=e.answer_start)
-        examples_with_info.append(new_e)
+    num_process = 1
+    start_index = 0
+    end_index = len(raw_ex)
+    batch_size = len(raw_ex) // num_process
+
+
+    def task(j):
+        start = start_index + j * batch_size
+        end = min(start_index + (j + 1) * batch_size, end_index)
+        examples = []
+        e: SquadExample
+        for e in tqdm(raw_ex[start:end], desc=f"Process {j}", position=j, leave=False):
+            new_e = extract_clue_and_question_info(
+                sentence=e.context_text, question=e.question_text, answer=e.answer_text, answer_start=e.answer_start)
+            examples.append(new_e)
+        return examples
+
+    # examples_list = []
+    # with mp.Pool(num_process) as pool:
+    #     examples_list = pool.map(task, range(num_process))
+
+    examples_with_info = task(0)
+    # for e in examples_list:
+        # examples_with_info += e
 
     return examples_with_info
 
@@ -238,20 +257,20 @@ def pipeline(input_file: str):
     """
     Pipeline for processing squad examples.
     """
-    # mp.set_start_method("spawn")
+    mp.set_start_method("spawn")
 
     raw_ex = read_squad_examples(input_file)
     raw_ex = raw_ex[:1000]
 
     processed_ex = create_process_squad_examples(raw_ex)
     print(processed_ex[:6])
-    with_info_ex = create_squad_example_with_info(raw_ex)
+    with_info_ex = create_squad_example_with_info(raw_ex[:1000])
 
     sample_probs = calculate_probability_distribution(with_info_ex)
 
     return sample_probs
 
 
 if __name__ == "__main__":
-    data = pipeline("squad/dataset/train.txt")
+    data = pipeline("botiverse/Theorizer/squad/dataset/train.txt")
     print(data)
diff --git a/botiverse/Theorizer/squad/squad_sample_probs.pkl.old b/botiverse/Theorizer/squad/squad_sample_probs.pkl.old
diff --git a/botiverse/Theorizer/squad/utils.py b/botiverse/Theorizer/squad/utils.py
@@ -5,6 +5,7 @@
 import benepar
 import numpy as np
 import math
+
 NLP = spacy.load('en_core_web_sm')
 benepar.download('benepar_en3')
 PARSER = benepar.Parser("benepar_en3")
@@ -13,7 +14,7 @@
     [
         word.rstrip().lower()
         for word in open(
-            "botiverse/Theorizer/squad/info_extractor.py", "r", encoding="utf-8"
+            "botiverse/Theorizer/squad/function_words.txt", "r", encoding="utf-8"
         ).readlines()
     ]
 )

diff --git a/examples/Theorizer/example.ipynb b/examples/Theorizer/example.ipynb
@@ -0,0 +1,47 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from botiverse.Theorizer import generate\n",
+    "import json"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "context = \"Bob is eating a delicious cake in Vancouver.\"\n",
+    "qa_dict = generate(context)\n",
+    "print(json.dumps(qa_dict,indent=4))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/setup.py b/setup.py
@@ -41,7 +41,7 @@
         "Programming Language :: Python :: 3.9",
         "Operating System :: OS Independent"
     ],
-    packages=["botiverse", "botiverse.bots", "botiverse.models", "botiverse.preprocessors"],
+    packages=["botiverse", "botiverse.bots", "botiverse.models", "botiverse.preprocessors","botiverse.Theorizer"],
     include_package_data=True,
     install_requires=["numpy", "torch"]            # just as was in requirements.txt
 )