Showing 37 changed files with 2,227 additions and 9 deletions.
@@ -0,0 +1,12 @@
__version__ = "1.9.0"

from cntext.dictionary import SoPmi, W2VModels, co_occurrence_matrix, Glove
from cntext.similarity import jaccard_sim, minedit_sim, simple_sim, cosine_sim
from cntext.stats import load_pkl_dict, dict_pkl_list, term_freq, readability, sentiment, sentiment_by_valence, sentiment_by_weight
from cntext.mind import Text2Mind
from cntext.io import get_files, read_file, read_files, detect_encoding
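The package __init__.py above pins the version and re-exports the public API. A minimal import sketch against those exported names; the sample strings are illustrative and the exact call signatures are assumptions, not confirmed by this diff:

import cntext as ct

print(ct.__version__)                                # '1.9.0'
print(ct.dict_pkl_list())                            # dictionary pickles shipped with the package (assumed no-arg helper)
print(ct.jaccard_sim('Hello world', 'Hello there'))  # similarity of two short texts (assumed two-string signature)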
Binary file added (+54 KB): build/lib/cntext/files/Chinese_Loughran_McDonald_Financial_Sentiment.pkl
@@ -0,0 +1,117 @@
from pdfdocx import read_pdf, read_docx
import pandas as pd
import glob
import os
import chardet


def detect_encoding(file, num_lines=100):
    """
    Detect the encoding of a file.
    Args:
        file (str): file path
        num_lines (int, optional): maximum number of lines to feed the detector. Defaults to 100.
    Returns:
        the detected encoding name
    """
    with open(file, 'rb') as f:
        detector = chardet.UniversalDetector()
        for line in f:
            detector.feed(line)
            if detector.done:
                break
            num_lines -= 1
            if num_lines == 0:
                break
        detector.close()
        return detector.result['encoding']


def read_file(file, encoding='utf-8', **kwargs):
    """
    Read data from a common file format; supports .txt, .csv, .xls(x), .pdf, .docx, .json, .dta, etc.
    Args:
        file (str): file path
        encoding (str, optional): Defaults to 'utf-8'.
        **kwargs: extra arguments passed to pd.read_csv, pd.read_excel, pd.read_stata,
            pd.read_json, pdfdocx.read_pdf, or pdfdocx.read_docx
    Returns:
        DataFrame
    """
    fname = file.lower()
    if fname.endswith('.txt'):
        with open(file, 'r', encoding=encoding, **kwargs) as f:
            text = f.read()
    elif fname.endswith('.docx'):
        text = read_docx(file)
    elif fname.endswith('.pdf'):
        text = read_pdf(file, **kwargs)
    elif fname.endswith(('.xls', '.xlsx')):
        text = pd.read_excel(file, **kwargs)
    elif fname.endswith('.csv'):
        text = pd.read_csv(file, encoding=encoding, **kwargs)
    elif fname.endswith('.dta'):
        text = pd.read_stata(file, **kwargs)
    elif fname.endswith('.json'):
        text = pd.read_json(file, encoding=encoding, **kwargs)
    else:
        print('Unsupported file format')
        text = pd.DataFrame(dict())

    if not isinstance(text, pd.DataFrame):
        df = pd.DataFrame({
            'doc': text,
            'file': file
        }, index=[0])
    else:
        df = text

    return df


def get_files(fformat='*.txt', recursive=True):
    """
    Get a list of file paths matching a glob pattern.
    Args:
        fformat (str): glob pattern used to filter files. Defaults to '*.txt', which returns
            only the TXT files in the current folder. Other options include '*.pdf', '*.docx',
            '*.csv', '*.xls', '*.xlsx'. If you don't know the subdirectory layout of the folder,
            use a recursive pattern such as '**/*.txt', '**/*.pdf', '**/*.docx', '**/*.csv',
            '**/*.xls', '**/*.xlsx'.
        recursive (bool, optional): whether to search subfolders recursively. Defaults to True.
    Returns:
        a list of file paths
    """
    file_list = glob.glob(fformat, recursive=recursive)

    # unify the path separator
    file_list = [file_path.replace('\\', '/') for file_path in file_list]
    return file_list


def read_files(fformat='*.*', encoding='utf-8', recursive=True, **kwargs):
    """
    Read all matching files from a folder into one DataFrame.
    Args:
        fformat (str): glob pattern used to filter files. Defaults to '*.*', which matches every
            file in the current folder. Other options include '*.txt', '*.pdf', '*.docx', '*.csv',
            '*.xls', '*.xlsx', '*.dta', '*.json'. If you don't know the subdirectory layout of the
            folder, use a recursive pattern such as '**/*.txt', '**/*.pdf', '**/*.docx',
            '**/*.csv', '**/*.dta', '**/*.json'.
        encoding (str, optional): Defaults to 'utf-8'.
        recursive (bool, optional): whether to search subfolders recursively. Defaults to True.
    Returns:
        DataFrame
    """
    dfs = []
    files = get_files(fformat=fformat, recursive=recursive)
    for file in files:
        try:
            dfs.append(read_file(file, encoding=encoding, **kwargs))
        except Exception:
            # skip files that fail to parse
            pass
    all_df = pd.concat(dfs, axis=0, ignore_index=True)
    return all_df
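A short sketch of how these helpers compose; the data/ folder and file names below are hypothetical:

import cntext as ct

# detect the encoding of a single file before reading it
enc = ct.detect_encoding('data/report.txt')

# read one file into a DataFrame with 'doc' and 'file' columns
df_one = ct.read_file('data/report.txt', encoding=enc)

# read every TXT file under data/, including subfolders, into a single DataFrame
df_all = ct.read_files(fformat='data/**/*.txt', encoding='utf-8', recursive=True)
print(df_all.head())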
@@ -0,0 +1,208 @@
import numpy as np
from numpy import dot
from numpy.linalg import norm
from time import time
from gensim.models.keyedvectors import KeyedVectors
import scipy.spatial.distance
import itertools


class Text2Mind(object):
    """
    Calculate cognitive (attitude, bias) direction and strength in text.
    """
    def __init__(self, w2v_model_path='glove_w2v.6B.100d.txt'):
        """
        Init the Text2Mind.
        :param w2v_model_path: pretrained embedding model file path; only word2vec-format pretrained models are supported!
        """
        print('Loading the model of {}'.format(w2v_model_path))
        start = time()
        try:
            # GloVe text files ship without a header line
            self.model = KeyedVectors.load_word2vec_format(w2v_model_path, binary=False, no_header=True)
        except Exception:
            # fall back to the standard word2vec text format (with header)
            self.model = KeyedVectors.load_word2vec_format(w2v_model_path)
        duration = round(time() - start, 2)
        print('Loaded successfully, used {} s'.format(duration))

    def k2v_model(self, word):
        """
        Return the underlying KeyedVectors object.
        """
        return self.model

    def get_vector(self, word):
        """
        Get the vector of a word.
        """
        return self.model.get_vector(word)

    def __get_centroid(self, words):
        """
        Calculate the centroid vector of multiple word vectors.
        :param words: word list
        :return: centroid vector
        """
        container = np.zeros(self.model.vector_size)
        for word in words:
            try:
                container = container + self.model.get_vector(word)
            except KeyError:
                # word not in the embedding model; skip it
                pass
        return container / len(words)

    def sematic_projection(self, words, c_words1, c_words2):
        """
        Calculate the projection of each word onto the concept vector and return the mean projected length.
        Note that the result reflects the direction of the concept: greater than 0 means semantically closer to c_words2.
        Refer to "Grand, G., Blank, I.A., Pereira, F. and Fedorenko, E., 2022. Semantic projection recovers rich human knowledge of multiple object features from word embeddings. Nature Human Behaviour, pp.1-13."
        For example, for the concept of size, if positive should mean big and negative small,
        set c_words1 = ["small", "little", "tiny"] and c_words2 = ["large", "big", "huge"].
        :param words: word list
        :param c_words1: concept words1, e.g. c_words1 = ["small", "little", "tiny"]
        :param c_words2: concept words2, e.g. c_words2 = ["large", "big", "huge"]
        :return: mean projection score
        """
        projection_scores = []
        # make sure the words are in the vector model
        source_vector = self.__get_centroid(c_words1)
        target_vector = self.__get_centroid(c_words2)
        c_vector = target_vector - source_vector
        concept_norm = norm(c_vector)
        for word in words:
            any_vector = self.model.get_vector(word)
            projection_score = np.dot(any_vector, c_vector) / concept_norm
            projection_scores.append(projection_score)
        mean_projection_score = round(np.mean(projection_scores), 2)
        return mean_projection_score

    def sematic_distance(self, words, c_words1, c_words2):
        """
        Calculate the distance from words to c_words1 and c_words2 respectively, and return the difference between the two distances.
        Greater than 0 means semantically closer to c_words2.
        :param words: word list, e.g. words = ['program', 'software', 'computer']
        :param c_words1: concept words1, e.g. c_words1 = ["man", "he", "him"]
        :param c_words2: concept words2, e.g. c_words2 = ["woman", "she", "her"]
        :return: distance difference
        """
        any_vector = self.__get_centroid(words)
        c_vector1 = self.__get_centroid(c_words1)
        c_vector2 = self.__get_centroid(c_words2)
        dist_1 = np.linalg.norm(any_vector - c_vector1)
        dist_2 = np.linalg.norm(any_vector - c_vector2)
        return round(dist_1 - dist_2, 2)

    def divergent_association_task(self, words, minimum=7):
        """
        Compute the DAT score. For the details of the algorithm, refer to Olson, J. A., Nahas, J., Chmoulevitch, D., Cropper, S. J., & Webb, M. E. (2021). Naming unrelated words predicts creativity. Proceedings of the National Academy of Sciences, 118(25), e2022340118.
        """
        # Keep only valid unique words
        uniques = []
        for word in words:
            try:
                self.model.get_vector(word)
                uniques.append(word)
            except KeyError:
                pass

        # Keep a subset of words
        if len(uniques) >= minimum:
            subset = uniques[:minimum]
        else:
            return None  # Not enough valid words

        # Compute distances between each pair of words
        distances = []
        for word1, word2 in itertools.combinations(subset, 2):
            dist = scipy.spatial.distance.cosine(self.model.get_vector(word1), self.model.get_vector(word2))
            distances.append(dist)

        # Compute the DAT score (average semantic distance multiplied by 100)
        return (sum(distances) / len(distances)) * 100

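A minimal usage sketch of Text2Mind, assuming a GloVe file in word2vec text format is available locally; the path and the word lists are illustrative:

from cntext.mind import Text2Mind

tm = Text2Mind(w2v_model_path='glove_w2v.6B.100d.txt')

# > 0 means the words are semantically closer to c_words2 (the "woman" side)
print(tm.sematic_distance(words=['program', 'software', 'computer'],
                          c_words1=['man', 'he', 'him'],
                          c_words2=['woman', 'she', 'her']))

# projection onto the small->large axis; > 0 means closer to "large"
print(tm.sematic_projection(words=['elephant', 'mouse'],
                            c_words1=['small', 'little', 'tiny'],
                            c_words2=['large', 'big', 'huge']))

# DAT creativity score: average pairwise cosine distance of 7 words, times 100
print(tm.divergent_association_task(['arm', 'eyes', 'feet', 'hand', 'head', 'leg', 'stomach']))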
# class Alignment(object):
#     def __init__(self):
#         pass


# def procrustes(A, B):
#     """
#     Learn the best rotation matrix to align matrix B to A
#     https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem
#     """
#     # U, _, Vt = np.linalg.svd(B.dot(A.T))
#     U, _, Vt = np.linalg.svd(B.T.dot(A))
#     return U.dot(Vt)

# def intersect_vocab(idx1, idx2):
#     """ Intersect the two vocabularies
#
#     Parameters:
#     ===========
#     idx1 (dict): the mapping for vocabulary in the first group
#     idx2 (dict): the mapping for vocabulary in the second group
#
#     Returns:
#     ========
#     common_idx, common_iidx (tuple): the common mapping for vocabulary in both groups
#     """
#     common = idx1.keys() & idx2.keys()
#     common_vocab = [v for v in common]
#
#     common_idx, common_iidx = {v: i for i, v in enumerate(common_vocab)}, {i: v for i, v in enumerate(common_vocab)}
#     return common_vocab, (common_idx, common_iidx)

# def align_matrices(mat1, mat2, idx1, idx2):
#     """ Align the embedding matrices and their vocabularies.
#
#     Parameters:
#     ===========
#     mat1 (numpy.ndarray): embedding matrix for first group
#     mat2 (numpy.ndarray): embedding matrix for second group
#     index1 (dict): the mapping dictionary for first group
#     index2 (dict): the mapping dictionary for the second group
#
#     Returns:
#     ========
#     remapped_mat1 (numpy.ndarray): the aligned matrix for first group
#     remapped_mat2 (numpy.ndarray): the aligned matrix for second group
#     common_vocab (tuple): the mapping dictionaries for both the matrices
#     """
#     common_vocab, (common_idx, common_iidx) = intersect_vocab(idx1, idx2)
#     row_nums1 = [idx1[v] for v in common_vocab]
#     row_nums2 = [idx2[v] for v in common_vocab]
#
#     # print(len(common_vocab), len(common_idx), len(common_iidx))
#     remapped_mat1 = mat1[row_nums1, :]
#     remapped_mat2 = mat2[row_nums2, :]
#     # print(mat1.shape, mat2.shape, remapped_mat1.shape, remapped_mat2.shape)
#
#     omega = procrustes(remapped_mat1, remapped_mat2)
#     # print(omega.shape)
#     # rotated_mat2 = np.dot(omega, remapped_mat2)
#     rotated_mat2 = np.dot(remapped_mat2, omega)
#
#     return remapped_mat1, rotated_mat2, (common_idx, common_iidx)
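The commented-out helpers above sketch orthogonal Procrustes alignment between two embedding spaces. A self-contained sketch of the core rotation step under the same convention (rotate the rows of B to best match A), with a toy check; all names here are illustrative:

import numpy as np

def procrustes(A, B):
    # best orthogonal Omega minimizing ||B @ Omega - A||_F
    U, _, Vt = np.linalg.svd(B.T.dot(A))
    return U.dot(Vt)

# toy check: B is A rotated by a random orthogonal matrix Q; alignment recovers A
rng = np.random.default_rng(0)
A = rng.normal(size=(50, 10))
Q, _ = np.linalg.qr(rng.normal(size=(10, 10)))
B = A.dot(Q.T)
omega = procrustes(A, B)
print(np.allclose(B.dot(omega), A, atol=1e-6))  # True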