-
Notifications
You must be signed in to change notification settings - Fork 0
/
inverse_index.py
118 lines (87 loc) · 3.37 KB
/
inverse_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import pandas as pd
import math
import pickle
import dataCleanupPart1
# Import-time bootstrap: the functions below read "cleaned_data.csv" from the
# current working directory, so generate it once if it is not there yet.
_cleaned_dataset_present = os.path.isfile("cleaned_data.csv")
if not _cleaned_dataset_present:
    print("Creating cleaned dataset...")
    dataCleanupPart1.clean_dataset()
def get_number_of_docs() -> int:
    """Return how many rows (documents) ``cleaned_data.csv`` contains.

    The CSV is read from the current working directory on every call.
    """
    return pd.read_csv("cleaned_data.csv").shape[0]
def create_inverse_index_catalogue():
    """Build the inverted index for ``cleaned_data.csv`` and pickle it.

    Each row's ``speech`` value is split on single spaces and every resulting
    token is mapped to the documents it occurs in, together with its term
    frequency: ``{token: {doc_id: count}}``. The finished dictionary is
    written to ``inverse_index.pkl`` in the current working directory.

    A missing (NaN) speech is treated as the empty string, which
    ``split(" ")`` turns into a single empty-string token; consecutive spaces
    also yield empty-string tokens. This tokenisation is kept as-is because
    ``calculate_tf_idf_similarity`` splits speeches the same way and looks
    every token up in this index.

    Returns:
        None. The result is the pickle file on disk.
    """
    df = pd.read_csv("cleaned_data.csv")
    inverse_index_catalogue = {}

    # Iterating the two needed columns directly avoids building a Series per
    # row (as ``iterrows`` does). ``index`` is the zero-based row position and
    # is used only for progress reporting.
    rows = zip(df["doc_id"], df["speech"])
    for index, (doc_id, raw_speech) in enumerate(rows):
        speech = str(raw_speech) if pd.notna(raw_speech) else ""
        for word in speech.split(" "):
            postings = inverse_index_catalogue.setdefault(word, {})
            postings[doc_id] = postings.get(doc_id, 0) + 1

        # Progress report every 100000 rows (including row 0).
        if index % 100000 == 0:
            print(f"Processed {index} documents")

    print("I am done")
    with open('inverse_index.pkl', 'wb') as file:
        pickle.dump(inverse_index_catalogue, file)
def calculate_tf_idf_similarity(cleaned_query: list, *, min_doc_length: int = 15) -> list:
    """Score every document against the query with length-normalised TF-IDF.

    For each query term found in the inverted index, every document in its
    postings receives ``(1 + ln(tf)) * ln(1 + N / n_t)``, where ``N`` is the
    number of rows in ``cleaned_data.csv`` and ``n_t`` the number of documents
    containing the term. Each non-zero score is then divided by the
    document's Euclidean length, computed with the same weighting over the
    document's *distinct* tokens.

    The index is loaded from ``inverse_index.pkl``; if that file does not
    exist it is built first via ``create_inverse_index_catalogue``.

    The code indexes the score list with ``doc_id`` and looks up postings
    with the row position, so it relies on every ``doc_id`` being equal to
    the zero-based row position of its document in the CSV.

    Parameters:
        cleaned_query (list): Cleaned and stemmed query words. Words missing
            from the index are ignored.
        min_doc_length (int): Documents whose speech splits into fewer than
            this many space-separated tokens get a score of 0. Defaults to
            15.

    Returns:
        list: One score per CSV row, in row order. The score is 0 for
            documents that match no query word or are shorter than
            ``min_doc_length``.

    Raises:
        KeyError: If a token of a scored document is missing from the index
            (for example when the pickle is stale relative to the CSV).
    """
    df = pd.read_csv("cleaned_data.csv")
    if not os.path.isfile("inverse_index.pkl"):
        # Announce the build before starting it; it can take a while.
        print("Creating the inverse index")
        create_inverse_index_catalogue()
    print("Here")
    # NOTE(security): unpickling runs arbitrary code, so this file must only
    # ever be the one produced locally by create_inverse_index_catalogue.
    with open("inverse_index.pkl", 'rb') as file:
        inverse_index_catalogue = pickle.load(file)
    print("Opened and continuing the work")
    print(len(inverse_index_catalogue))

    # Reuse the frame that is already loaded instead of parsing the CSV again.
    NUMBER_OF_DOCS = len(df)
    accumulators = [0] * NUMBER_OF_DOCS
    print("Initialized accumulators")

    # Pass 1: accumulate the query-term weights per document.
    for word in cleaned_query:
        postings = inverse_index_catalogue.get(word)
        if not postings:
            continue
        idft = math.log(1 + (NUMBER_OF_DOCS / len(postings)))
        for doc_id, tf in postings.items():
            accumulators[doc_id] += idft * (1 + math.log(tf))

    # Pass 2: normalise each matching document by its vector length.
    speeches = df["speech"]
    for i, score in enumerate(accumulators):
        if score == 0:
            continue
        raw_speech = speeches.iat[i]
        text = str(raw_speech) if pd.notna(raw_speech) else ""
        tokens = text.split(" ")
        if len(tokens) < min_doc_length:
            accumulators[i] = 0
            continue

        # Sum over distinct tokens only: the term frequency is already part
        # of the weight, so visiting each occurrence would count a repeated
        # term several times. dict.fromkeys de-duplicates while keeping a
        # stable order, which keeps the float sum reproducible.
        squared_length = 0.0
        for word in dict.fromkeys(tokens):
            postings = inverse_index_catalogue[word]
            idft = math.log(1 + (NUMBER_OF_DOCS / len(postings)))
            tf = 1 + math.log(postings[i])
            squared_length += (tf * idft) ** 2
        accumulators[i] = score / math.sqrt(squared_length)

    return accumulators