
Commit

20231228
hiDaDeng committed Dec 28, 2023
1 parent 2a3309c commit f255e2a
Showing 37 changed files with 2,227 additions and 9 deletions.
Binary file modified .DS_Store
Binary file not shown.
12 changes: 12 additions & 0 deletions build/lib/cntext/__init__.py
@@ -0,0 +1,12 @@
__version__ = "1.9.0"

from cntext.dictionary import SoPmi, W2VModels, co_occurrence_matrix, Glove
from cntext.similarity import jaccard_sim, minedit_sim, simple_sim, cosine_sim
from cntext.stats import load_pkl_dict, dict_pkl_list, term_freq, readability, sentiment, sentiment_by_valence, sentiment_by_weight
from cntext.mind import Text2Mind
from cntext.io import get_files, read_file, read_files, detect_encoding
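These re-exports define the package's public surface. As a minimal usage sketch (assuming cntext 1.9.0 is installed; the helper names come from the imports above):

import cntext as ct

print(ct.__version__)   # "1.9.0"
# the re-exported helpers are available directly on the package,
# e.g. ct.read_file, ct.term_freq, ct.sentiment, ct.Text2Mind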





543 changes: 543 additions & 0 deletions build/lib/cntext/dictionary.py

Large diffs are not rendered by default.

Binary file added build/lib/cntext/files/ADV_CONJ.pkl
Binary file not shown.
Binary file added build/lib/cntext/files/AFINN.pkl
Binary file not shown.
Binary file added build/lib/cntext/files/ANEW.pkl
Binary file not shown.
Binary file added build/lib/cntext/files/ChineseEmoBank.pkl
Binary file not shown.
Binary file added build/lib/cntext/files/Chinese_Digitalization.pkl
Binary file not shown.
Binary file added build/lib/cntext/files/Chinese_FLS.pkl
Binary file not shown.
Binary file added build/lib/cntext/files/Concreteness.pkl
Binary file not shown.
Binary file added build/lib/cntext/files/DUTIR.pkl
Binary file not shown.
Binary file added build/lib/cntext/files/HOWNET.pkl
Binary file not shown.
Binary file added build/lib/cntext/files/HuLiu.pkl
Binary file not shown.
Binary file added build/lib/cntext/files/LSD2015.pkl
Binary file not shown.
Binary file added build/lib/cntext/files/NRC.pkl
Binary file not shown.
Binary file added build/lib/cntext/files/STOPWORDS.pkl
Binary file not shown.
Binary file added build/lib/cntext/files/geninqposneg.pkl
Binary file not shown.
Binary file added build/lib/cntext/files/sentiws.pkl
Binary file not shown.
117 changes: 117 additions & 0 deletions build/lib/cntext/io.py
@@ -0,0 +1,117 @@
from pdfdocx import read_pdf, read_docx
import pandas as pd
import glob
import os
import chardet



def detect_encoding(file, num_lines=100):
"""
Detect encoding of file
Args:
file (str): file path
num_lines (int, optional): Defaults to 100.
Returns:
encoding type
"""
with open(file, 'rb') as f:
detector = chardet.UniversalDetector()
for line in f:
detector.feed(line)
if detector.done:
break
num_lines -= 1
if num_lines == 0:
break
detector.close()
return detector.result['encoding']
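A brief usage sketch (the file path is hypothetical):

# sniff the encoding first, then read the file with it
enc = detect_encoding('data/sample.txt', num_lines=50)   # e.g. 'utf-8' or 'GB2312'
text = open('data/sample.txt', encoding=enc).read()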



def read_file(file, encoding='utf-8', **kwargs):
"""
Read data from common format file, support .txt, .csv, .pdf, .docx, .json, .dta, etc.
Args:
file (str): file path
encoding (str, optional): Defaults to 'utf-8'.
**kwargs: other arguments for pd.read_csv, pd.read_excel, pd.read_stata, pd.read_json, pdfdocx.read_pdf, pdfdocx.read_docx
Returns:
DataFrame
"""
if '.txt' in file or '.TXT' in file:
text = open(file, 'r', encoding=encoding, **kwargs).read()
elif '.docx' in file or '.DOCX' in file:
text = read_docx(file)
elif '.pdf' in file or '.PDF' in file:
text = read_pdf(file, **kwargs)
elif '.xls' in file or '.xlsx' in file:
        text = pd.read_excel(file, **kwargs)
elif '.csv' in file or '.CSV' in file:
text = pd.read_csv(file, encoding=encoding, **kwargs)
elif '.dta' in file or '.DTA' in file:
text = pd.read_stata(file, **kwargs)
elif '.json' in file or '.JSON' in file:
text = pd.read_json(file, encoding=encoding, **kwargs)
else:
        print('Unsupported file format')
text = pd.DataFrame(dict())

    if not isinstance(text, pd.DataFrame):
df = pd.DataFrame({
'doc': text,
'file': file
}, index=[0])
else:
df = text

return df
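A usage sketch (file names hypothetical). Text-like formats come back as a one-row DataFrame with 'doc' and 'file' columns, while tabular formats keep their own columns:

df = read_file('report.docx')
print(df['doc'][0])      # full text of the document

df2 = read_file('survey.csv', encoding='utf-8')   # ordinary pandas DataFrame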



def get_files(fformat='*.txt', recursive=True):
"""
Get a list of file path in a folder
Args:
fformat (str): filter files, the default value is '*.txt', which means this function only returns a file path list of TXT files. Defaults to '*.txt'. Other options are '*.pdf', '*.docx', '*.csv', '*.xls', '*.xlsx', '*.txt' .If you dont know the subdirectory of the folder, you can use '**/*.txt',
'**/*.pdf', '**/*.docx', '**/*.csv', '**/*.xls', '**/*.xlsx', '**/*.txt'
recursive (bool, optional): Whether to recursive search in folder. Defaults to True.
Returns:
a list of file path
"""
file_list = glob.glob(fformat, recursive=recursive)

    # unify the path separator ('\\' on Windows) to '/'
file_list = [file_path.replace('\\', '/') for file_path in file_list]
return file_list
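For instance (patterns illustrative):

# TXT files in the current folder only
paths = get_files(fformat='*.txt')
# TXT files in the current folder and all of its subfolders
paths = get_files(fformat='**/*.txt', recursive=True)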



def read_files(fformat='*.*', encoding='utf-8', recursive=True, **kwargs):
"""
Read files from specificed folder path.
Args:
fformat (str): filter files, the default value is '*.txt', which means this function only returns a file path list of TXT files. Defaults to '*.txt'. Other options are '*.pdf', '*.docx', '*.csv', '*.xls', '*.xlsx', '*.txt' , '*.dta', '*.json' .If you dont know the subdirectory of the folder, you can use '*/*.txt',
'*/*.pdf', '*/*.docx', '*/*.csv', '*/*.xls', '*/*.xlsx', '*/*.txt' , '*/*.dta' , '*/*.json'
recursive (bool, optional): Whether to recursive search in folder. Defaults to True.
Returns:
DataFrame
"""
dfs = []
files = get_files(fformat=fformat, recursive=recursive)
for file in files:
        try:
            dfs.append(read_file(file, encoding=encoding, **kwargs))
        except Exception:
            pass  # skip files that cannot be parsed
all_df = pd.concat(dfs, axis=0, ignore_index=True)
return all_df
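A usage sketch (folder layout hypothetical):

# read every TXT file under data/ into one DataFrame with 'doc' and 'file' columns
all_df = read_files(fformat='data/**/*.txt', encoding='utf-8')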
208 changes: 208 additions & 0 deletions build/lib/cntext/mind.py
@@ -0,0 +1,208 @@
import numpy as np
from numpy import dot
from numpy.linalg import norm
from time import time
from gensim.models.keyedvectors import KeyedVectors
import scipy.spatial.distance
import itertools

class Text2Mind(object):
"""
Calculate cognitive (attitude, bias) direction and strength in text
"""
def __init__(self, w2v_model_path='glove_w2v.6B.100d.txt'):
"""
        Init the Text2Mind instance.
        :param w2v_model_path: pretrained embedding model file path; only word2vec-format pretrained models are supported.
"""
print('Loading the model of {}'.format(w2v_model_path))
start = time()
        try:
            # assume a headerless word2vec text file first
            self.model = KeyedVectors.load_word2vec_format(w2v_model_path, binary=False, no_header=True)
        except Exception:
            # fall back to the standard word2vec format with a header line
            self.model = KeyedVectors.load_word2vec_format(w2v_model_path)
duration = round(time()-start, 2)
print('Load successfully, used {} s'.format(duration))

    def k2v_model(self, word):
        """
        Return the underlying gensim KeyedVectors object (the word argument is unused).
        """
        return self.model


def get_vector(self, word):
"""
get word vector
"""
return self.model.get_vector(word)
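A construction sketch (the model path is hypothetical; any word2vec-format text file works):

tm = Text2Mind(w2v_model_path='glove_w2v.6B.100d.txt')
vec = tm.get_vector('computer')   # the word's embedding vector (numpy array)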



    def __get_centroid(self, words):
        """
        Calculate the centroid (mean) vector of multiple word vectors.
        :param words: word list
        :return: centroid vector
        """
        container = np.zeros(self.model.vector_size)
        for word in words:
            try:
                container = container + self.model.get_vector(word)
            except KeyError:
                pass  # skip words that are not in the embedding vocabulary
        return container / len(words)


def sematic_projection(self, words, c_words1, c_words2):
"""
        Calculate the projection of each word onto the concept vector and return the mean projection score. Note that the result reflects the direction of the concept:
        greater than 0 means semantically closer to c_words2.
        See "Grand, G., Blank, I.A., Pereira, F. and Fedorenko, E., 2022. Semantic projection recovers rich human knowledge of multiple object features from word embeddings. Nature Human Behaviour, pp.1-13."
        For example, for the concept of size, if positive should mean big and negative should mean small,
        set c_words1 = ["small", "little", "tiny"] and c_words2 = ["large", "big", "huge"].
        :param words: word list
        :param c_words1: concept words1, e.g. ["small", "little", "tiny"]
        :param c_words2: concept words2, e.g. ["large", "big", "huge"]
        :return: mean projection score, rounded to 2 decimal places
"""
projection_scores = []
        # make sure the words are in the embedding model
source_vector = self.__get_centroid(c_words1)
target_vector = self.__get_centroid(c_words2)
c_vector = target_vector - source_vector
concept_norm = norm(c_vector)
for word in words:
any_vector = self.model.get_vector(word)
projection_score = np.dot(any_vector, c_vector) / concept_norm
projection_scores.append(projection_score)
mean_projection_score = np.mean(projection_scores)
mean_projection_score = round(mean_projection_score, 2)
return mean_projection_score
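Continuing the sketch above, a size projection in the spirit of Grand et al. (2022) (the word lists are illustrative):

score = tm.sematic_projection(
    words=['apple', 'watermelon'],
    c_words1=['small', 'little', 'tiny'],
    c_words2=['large', 'big', 'huge'])
# score > 0: the words lean toward the "large" end of the size axis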


def sematic_distance(self, words, c_words1, c_words2):
"""
        Calculate the distances from words to c_words1 and c_words2 respectively, and return the difference between the two distances.
        Greater than 0 means semantically closer to c_words2.
:param words: words list, words = ['program', 'software', 'computer']
:param c_words1: concept words1, c_words1 = ["man", "he", "him"]
:param c_words2: concept words2, c_words2 = ["woman", "she", "her"]
:return:
"""
any_vector = self.__get_centroid(words)
c_vector1 = self.__get_centroid(c_words1)
c_vector2 = self.__get_centroid(c_words2)
dist_1 = np.linalg.norm(any_vector - c_vector1)
dist_2 = np.linalg.norm(any_vector - c_vector2)
res = dist_1-dist_2
return round(res, 2)
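Using the docstring's own example words:

diff = tm.sematic_distance(
    words=['program', 'software', 'computer'],
    c_words1=['man', 'he', 'him'],
    c_words2=['woman', 'she', 'her'])
# diff > 0: the words sit closer to c_words2 ('woman', 'she', 'her')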


    def divergent_association_task(self, words, minimum=7):
        """Compute the DAT score. For details of the algorithm, see Olson, J. A., Nahas, J., Chmoulevitch, D., Cropper, S. J., & Webb, M. E. (2021). Naming unrelated words predicts creativity. Proceedings of the National Academy of Sciences, 118(25), e2022340118."""
# Keep only valid unique words
uniques = []
        for word in words:
            try:
                self.model.get_vector(word)
                uniques.append(word)
            except KeyError:
                pass  # drop words that are not in the embedding vocabulary


# Keep subset of words
if len(uniques) >= minimum:
subset = uniques[:minimum]
else:
return None # Not enough valid words

# Compute distances between each pair of words
distances = []
for word1, word2 in itertools.combinations(subset, 2):
dist = scipy.spatial.distance.cosine(self.model.get_vector(word1), self.model.get_vector(word2))
distances.append(dist)

# Compute the DAT score (average semantic distance multiplied by 100)
return (sum(distances) / len(distances)) * 100
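A usage sketch (the word list is illustrative; at least "minimum" in-vocabulary words are required, seven by default):

dat = tm.divergent_association_task(
    ['cat', 'justice', 'volcano', 'piano', 'algorithm', 'butter', 'satellite'])
# higher scores mean more semantically distant, i.e. more "divergent", word choices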





# class Alignment(object):
# def __init__(self):
# pass



# def procrustes(A, B):
# """
# Learn the best rotation matrix to align matrix B to A
# https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem
# """
# # U, _, Vt = np.linalg.svd(B.dot(A.T))
# U, _, Vt = np.linalg.svd(B.T.dot(A))
# return U.dot(Vt)

# def intersect_vocab (idx1, idx2):
# """ Intersect the two vocabularies

# Parameters:
# ===========
# idx1 (dict): the mapping for vocabulary in the first group
# idx2 (dict): the mapping for vocabulary in the second group

# Returns:
# ========
# common_idx, common_iidx (tuple): the common mapping for vocabulary in both groups
# """
# common = idx1.keys() & idx2.keys()
# common_vocab = [v for v in common]

# common_idx, common_iidx = {v:i for i,v in enumerate (common_vocab)}, {i:v for i,v in enumerate (common_vocab)}
# return common_vocab, (common_idx, common_iidx)

# def align_matrices (mat1, mat2, idx1, idx2):
# """ Align the embedding matrices and their vocabularies.

# Parameters:
# ===========
# mat1 (numpy.ndarray): embedding matrix for first group
# mat2 (numpy.ndarray): embedding matrix for second group

# index1 (dict): the mapping dictionary for first group
# index2 (dict): the mapping dictionary for the second group

# Returns:
# ========
# remapped_mat1 (numpy.ndarray): the aligned matrix for first group
# remapped_mat2 (numpy.ndarray): the aligned matrix for second group
# common_vocab (tuple): the mapping dictionaries for both the matrices
# """
# common_vocab, (common_idx, common_iidx) = intersect_vocab (idx1, idx2)
# row_nums1 = [idx1[v] for v in common_vocab]
# row_nums2 = [idx2[v] for v in common_vocab]

# #print (len(common_vocab), len (common_idx), len (common_iidx))
# remapped_mat1 = mat1[row_nums1, :]
# remapped_mat2 = mat2[row_nums2, :]
# #print (mat1.shape, mat2.shape, remapped_mat1.shape, remapped_mat2.shape)

# omega = procrustes (remapped_mat1, remapped_mat2)
# #print (omega.shape)
# # rotated_mat2 = np.dot (omega, remapped_mat2)
# rotated_mat2 = np.dot (remapped_mat2, omega)

# return remapped_mat1, rotated_mat2, (common_idx, common_iidx)






