Showing 37 changed files with 2,227 additions and 9 deletions.
@@ -0,0 +1,12 @@
__version__ = "1.9.0"

from cntext.dictionary import SoPmi, W2VModels, co_occurrence_matrix, Glove
from cntext.similarity import jaccard_sim, minedit_sim, simple_sim, cosine_sim
from cntext.stats import load_pkl_dict, dict_pkl_list, term_freq, readability, sentiment, sentiment_by_valence, sentiment_by_weight
from cntext.mind import Text2Mind
from cntext.io import get_files, read_file, read_files, detect_encoding
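The package __init__.py above pins the version and re-exports the public API. A minimal import sketch against those exported names; the sample strings are illustrative and the exact call signatures are assumptions, not confirmed by this diff:

import cntext as ct

print(ct.__version__)                                # '1.9.0'
print(ct.dict_pkl_list())                            # dictionary pickles shipped with the package (assumed no-arg helper)
print(ct.jaccard_sim('Hello world', 'Hello there'))  # similarity of two short texts (assumed two-string signature)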
Binary file added (+54 KB): build/lib/cntext/files/Chinese_Loughran_McDonald_Financial_Sentiment.pkl
@@ -0,0 +1,117 @@
from pdfdocx import read_pdf, read_docx
import pandas as pd
import glob
import os
import chardet


def detect_encoding(file, num_lines=100):
    """
    Detect the encoding of a file.
    Args:
        file (str): file path
        num_lines (int, optional): maximum number of lines to feed the detector. Defaults to 100.
    Returns:
        the detected encoding name
    """
    with open(file, 'rb') as f:
        detector = chardet.UniversalDetector()
        for line in f:
            detector.feed(line)
            if detector.done:
                break
            num_lines -= 1
            if num_lines == 0:
                break
        detector.close()
        return detector.result['encoding']


def read_file(file, encoding='utf-8', **kwargs):
    """
    Read data from a common file format; supports .txt, .csv, .xls(x), .pdf, .docx, .json, .dta, etc.
    Args:
        file (str): file path
        encoding (str, optional): Defaults to 'utf-8'.
        **kwargs: extra arguments passed to pd.read_csv, pd.read_excel, pd.read_stata,
            pd.read_json, pdfdocx.read_pdf, or pdfdocx.read_docx
    Returns:
        DataFrame
    """
    fname = file.lower()
    if fname.endswith('.txt'):
        with open(file, 'r', encoding=encoding, **kwargs) as f:
            text = f.read()
    elif fname.endswith('.docx'):
        text = read_docx(file)
    elif fname.endswith('.pdf'):
        text = read_pdf(file, **kwargs)
    elif fname.endswith(('.xls', '.xlsx')):
        text = pd.read_excel(file, **kwargs)
    elif fname.endswith('.csv'):
        text = pd.read_csv(file, encoding=encoding, **kwargs)
    elif fname.endswith('.dta'):
        text = pd.read_stata(file, **kwargs)
    elif fname.endswith('.json'):
        text = pd.read_json(file, encoding=encoding, **kwargs)
    else:
        print('Unsupported file format')
        text = pd.DataFrame(dict())

    if not isinstance(text, pd.DataFrame):
        df = pd.DataFrame({
            'doc': text,
            'file': file
        }, index=[0])
    else:
        df = text

    return df


def get_files(fformat='*.txt', recursive=True):
    """
    Get a list of file paths matching a glob pattern.
    Args:
        fformat (str): glob pattern used to filter files. Defaults to '*.txt', which returns
            only the TXT files in the current folder. Other options include '*.pdf', '*.docx',
            '*.csv', '*.xls', '*.xlsx'. If you don't know the subdirectory layout of the folder,
            use a recursive pattern such as '**/*.txt', '**/*.pdf', '**/*.docx', '**/*.csv',
            '**/*.xls', '**/*.xlsx'.
        recursive (bool, optional): whether to search subfolders recursively. Defaults to True.
    Returns:
        a list of file paths
    """
    file_list = glob.glob(fformat, recursive=recursive)

    # unify the path separator
    file_list = [file_path.replace('\\', '/') for file_path in file_list]
    return file_list


def read_files(fformat='*.*', encoding='utf-8', recursive=True, **kwargs):
    """
    Read all matching files from a folder into one DataFrame.
    Args:
        fformat (str): glob pattern used to filter files. Defaults to '*.*', which matches every
            file in the current folder. Other options include '*.txt', '*.pdf', '*.docx', '*.csv',
            '*.xls', '*.xlsx', '*.dta', '*.json'. If you don't know the subdirectory layout of the
            folder, use a recursive pattern such as '**/*.txt', '**/*.pdf', '**/*.docx',
            '**/*.csv', '**/*.dta', '**/*.json'.
        encoding (str, optional): Defaults to 'utf-8'.
        recursive (bool, optional): whether to search subfolders recursively. Defaults to True.
    Returns:
        DataFrame
    """
    dfs = []
    files = get_files(fformat=fformat, recursive=recursive)
    for file in files:
        try:
            dfs.append(read_file(file, encoding=encoding, **kwargs))
        except Exception:
            # skip files that fail to parse
            pass
    all_df = pd.concat(dfs, axis=0, ignore_index=True)
    return all_df
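A short sketch of how these helpers compose; the data/ folder and file names below are hypothetical:

import cntext as ct

# detect the encoding of a single file before reading it
enc = ct.detect_encoding('data/report.txt')

# read one file into a DataFrame with 'doc' and 'file' columns
df_one = ct.read_file('data/report.txt', encoding=enc)

# read every TXT file under data/, including subfolders, into a single DataFrame
df_all = ct.read_files(fformat='data/**/*.txt', encoding='utf-8', recursive=True)
print(df_all.head())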
@@ -0,0 +1,208 @@
import numpy as np
from numpy import dot
from numpy.linalg import norm
from time import time
from gensim.models.keyedvectors import KeyedVectors
import scipy.spatial.distance
import itertools


class Text2Mind(object):
    """
    Calculate cognitive (attitude, bias) direction and strength in text.
    """
    def __init__(self, w2v_model_path='glove_w2v.6B.100d.txt'):
        """
        Init the Text2Mind.
        :param w2v_model_path: pretrained embedding model file path; only word2vec-format pretrained models are supported!
        """
        print('Loading the model of {}'.format(w2v_model_path))
        start = time()
        try:
            # GloVe text files ship without a header line
            self.model = KeyedVectors.load_word2vec_format(w2v_model_path, binary=False, no_header=True)
        except Exception:
            # fall back to the standard word2vec text format (with header)
            self.model = KeyedVectors.load_word2vec_format(w2v_model_path)
        duration = round(time() - start, 2)
        print('Loaded successfully, used {} s'.format(duration))

    def k2v_model(self, word):
        """
        Return the underlying KeyedVectors object.
        """
        return self.model

    def get_vector(self, word):
        """
        Get the vector of a word.
        """
        return self.model.get_vector(word)

    def __get_centroid(self, words):
        """
        Calculate the centroid vector of multiple word vectors.
        :param words: word list
        :return: centroid vector
        """
        container = np.zeros(self.model.vector_size)
        for word in words:
            try:
                container = container + self.model.get_vector(word)
            except KeyError:
                # word not in the embedding model; skip it
                pass
        return container / len(words)

    def sematic_projection(self, words, c_words1, c_words2):
        """
        Calculate the projection of each word onto the concept vector and return the mean projected length.
        Note that the result reflects the direction of the concept: greater than 0 means semantically closer to c_words2.
        Refer to "Grand, G., Blank, I.A., Pereira, F. and Fedorenko, E., 2022. Semantic projection recovers rich human knowledge of multiple object features from word embeddings. Nature Human Behaviour, pp.1-13."
        For example, for the concept of size, if positive should mean big and negative small,
        set c_words1 = ["small", "little", "tiny"] and c_words2 = ["large", "big", "huge"].
        :param words: word list
        :param c_words1: concept words1, e.g. c_words1 = ["small", "little", "tiny"]
        :param c_words2: concept words2, e.g. c_words2 = ["large", "big", "huge"]
        :return: mean projection score
        """
        projection_scores = []
        # make sure the words are in the vector model
        source_vector = self.__get_centroid(c_words1)
        target_vector = self.__get_centroid(c_words2)
        c_vector = target_vector - source_vector
        concept_norm = norm(c_vector)
        for word in words:
            any_vector = self.model.get_vector(word)
            projection_score = np.dot(any_vector, c_vector) / concept_norm
            projection_scores.append(projection_score)
        mean_projection_score = round(np.mean(projection_scores), 2)
        return mean_projection_score

    def sematic_distance(self, words, c_words1, c_words2):
        """
        Calculate the distance from words to c_words1 and c_words2 respectively, and return the difference between the two distances.
        Greater than 0 means semantically closer to c_words2.
        :param words: word list, e.g. words = ['program', 'software', 'computer']
        :param c_words1: concept words1, e.g. c_words1 = ["man", "he", "him"]
        :param c_words2: concept words2, e.g. c_words2 = ["woman", "she", "her"]
        :return: distance difference
        """
        any_vector = self.__get_centroid(words)
        c_vector1 = self.__get_centroid(c_words1)
        c_vector2 = self.__get_centroid(c_words2)
        dist_1 = np.linalg.norm(any_vector - c_vector1)
        dist_2 = np.linalg.norm(any_vector - c_vector2)
        return round(dist_1 - dist_2, 2)

    def divergent_association_task(self, words, minimum=7):
        """
        Compute the DAT score. For the details of the algorithm, refer to Olson, J. A., Nahas, J., Chmoulevitch, D., Cropper, S. J., & Webb, M. E. (2021). Naming unrelated words predicts creativity. Proceedings of the National Academy of Sciences, 118(25), e2022340118.
        """
        # Keep only valid unique words
        uniques = []
        for word in words:
            try:
                self.model.get_vector(word)
                uniques.append(word)
            except KeyError:
                pass

        # Keep a subset of words
        if len(uniques) >= minimum:
            subset = uniques[:minimum]
        else:
            return None  # Not enough valid words

        # Compute distances between each pair of words
        distances = []
        for word1, word2 in itertools.combinations(subset, 2):
            dist = scipy.spatial.distance.cosine(self.model.get_vector(word1), self.model.get_vector(word2))
            distances.append(dist)

        # Compute the DAT score (average semantic distance multiplied by 100)
        return (sum(distances) / len(distances)) * 100

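A minimal usage sketch of Text2Mind, assuming a GloVe file in word2vec text format is available locally; the path and the word lists are illustrative:

from cntext.mind import Text2Mind

tm = Text2Mind(w2v_model_path='glove_w2v.6B.100d.txt')

# > 0 means the words are semantically closer to c_words2 (the "woman" side)
print(tm.sematic_distance(words=['program', 'software', 'computer'],
                          c_words1=['man', 'he', 'him'],
                          c_words2=['woman', 'she', 'her']))

# projection onto the small->large axis; > 0 means closer to "large"
print(tm.sematic_projection(words=['elephant', 'mouse'],
                            c_words1=['small', 'little', 'tiny'],
                            c_words2=['large', 'big', 'huge']))

# DAT creativity score: average pairwise cosine distance of 7 words, times 100
print(tm.divergent_association_task(['arm', 'eyes', 'feet', 'hand', 'head', 'leg', 'stomach']))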
# class Alignment(object):
#     def __init__(self):
#         pass


# def procrustes(A, B):
#     """
#     Learn the best rotation matrix to align matrix B to A
#     https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem
#     """
#     # U, _, Vt = np.linalg.svd(B.dot(A.T))
#     U, _, Vt = np.linalg.svd(B.T.dot(A))
#     return U.dot(Vt)

# def intersect_vocab(idx1, idx2):
#     """ Intersect the two vocabularies
#
#     Parameters:
#     ===========
#     idx1 (dict): the mapping for vocabulary in the first group
#     idx2 (dict): the mapping for vocabulary in the second group
#
#     Returns:
#     ========
#     common_idx, common_iidx (tuple): the common mapping for vocabulary in both groups
#     """
#     common = idx1.keys() & idx2.keys()
#     common_vocab = [v for v in common]
#
#     common_idx, common_iidx = {v: i for i, v in enumerate(common_vocab)}, {i: v for i, v in enumerate(common_vocab)}
#     return common_vocab, (common_idx, common_iidx)

# def align_matrices(mat1, mat2, idx1, idx2):
#     """ Align the embedding matrices and their vocabularies.
#
#     Parameters:
#     ===========
#     mat1 (numpy.ndarray): embedding matrix for first group
#     mat2 (numpy.ndarray): embedding matrix for second group
#     index1 (dict): the mapping dictionary for first group
#     index2 (dict): the mapping dictionary for the second group
#
#     Returns:
#     ========
#     remapped_mat1 (numpy.ndarray): the aligned matrix for first group
#     remapped_mat2 (numpy.ndarray): the aligned matrix for second group
#     common_vocab (tuple): the mapping dictionaries for both the matrices
#     """
#     common_vocab, (common_idx, common_iidx) = intersect_vocab(idx1, idx2)
#     row_nums1 = [idx1[v] for v in common_vocab]
#     row_nums2 = [idx2[v] for v in common_vocab]
#
#     # print(len(common_vocab), len(common_idx), len(common_iidx))
#     remapped_mat1 = mat1[row_nums1, :]
#     remapped_mat2 = mat2[row_nums2, :]
#     # print(mat1.shape, mat2.shape, remapped_mat1.shape, remapped_mat2.shape)
#
#     omega = procrustes(remapped_mat1, remapped_mat2)
#     # print(omega.shape)
#     # rotated_mat2 = np.dot(omega, remapped_mat2)
#     rotated_mat2 = np.dot(remapped_mat2, omega)
#
#     return remapped_mat1, rotated_mat2, (common_idx, common_iidx)
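The commented-out helpers above sketch orthogonal Procrustes alignment between two embedding spaces. A self-contained sketch of the core rotation step under the same convention (rotate the rows of B to best match A), with a toy check; all names here are illustrative:

import numpy as np

def procrustes(A, B):
    # best orthogonal Omega minimizing ||B @ Omega - A||_F
    U, _, Vt = np.linalg.svd(B.T.dot(A))
    return U.dot(Vt)

# toy check: B is A rotated by a random orthogonal matrix Q; alignment recovers A
rng = np.random.default_rng(0)
A = rng.normal(size=(50, 10))
Q, _ = np.linalg.qr(rng.normal(size=(10, 10)))
B = A.dot(Q.T)
omega = procrustes(A, B)
print(np.allclose(B.dot(omega), A, atol=1e-6))  # True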