1.8.4

hiDaDeng · Mar 9, 2023 · 4f305cc
1 parent 4770ffe
commit 4f305cc
Show file tree

Hide file tree

Showing 11 changed files with 10 additions and 14 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/build/lib/cntext/__init__.py b/build/lib/cntext/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "1.8.2"
+__version__ = "1.8.4"
 
 from cntext.dictionary import SoPmi, W2VModels, co_occurrence_matrix, Glove
 from cntext.similarity import jaccard_sim, minedit_sim, simple_sim, cosine_sim

diff --git a/build/lib/cntext/dictionary.py b/build/lib/cntext/dictionary.py
@@ -266,12 +266,13 @@ def __preproces(self, documents):
 
 
 
-    def train(self, input_txt_file, model_name='w2v.model', vector_size=100, min_count=5, ngram=False):
+    def train(self, input_txt_file, model_name='w2v.model', vector_size=100, window_size=6, min_count=5, ngram=False):
         """
         train word2vec model for corpus
         :param input_txt_file:  corpus file path
         :param model_name:  used as model name(save)
         :param vector_size: dimensionality of the word vectors.
+        :param window_size: window size for word2vec
         :param min_count: Set the word to appear at least min_count times in the model
         :param ngram: whether to take the ngram case into account，default False
         :return:
@@ -298,7 +299,7 @@ def train(self, input_txt_file, model_name='w2v.model', vector_size=100, min_cou
             sentences = sents
 
         print('Step 2/4:...Train  word2vec model\n            used   {} s'.format(duration))
-        self.model = word2vec.Word2Vec(sentences, vector_size=vector_size, min_count=min_count, workers=multiprocessing.cpu_count())
+        self.model = word2vec.Word2Vec(sentences, vector_size=vector_size, window=window_size, min_count=min_count, workers=multiprocessing.cpu_count())
         modeldir = Path(self.cwd).joinpath('output', 'w2v_candi_words')
         Path(self.cwd).joinpath('output').mkdir(exist_ok=True)
         Path(self.cwd).joinpath('output', 'w2v_candi_words').mkdir(exist_ok=True)
@@ -373,7 +374,6 @@ def find(self, seedword_txt_file, topn=50):
 
 
 
-
 def co_occurrence_matrix(documents, window_size=2, lang='chinese'):
     """
     Build a co-word matrix

diff --git a/build/lib/cntext/mind.py b/build/lib/cntext/mind.py
@@ -134,8 +134,6 @@ def divergent_association_task(words, minimum=7):
 
 
 
-
-
 # class Alignment(object):
 #     def __init__(self):
 #         pass

diff --git a/cntext.egg-info/PKG-INFO b/cntext.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cntext
-Version: 1.8.3
+Version: 1.8.4
 Summary: Chinese text analysis library, which can perform word frequency statistics, dictionary expansion, sentiment analysis, similarity, readability, co-occurrence analysis, social calculation (attitude, prejudice, culture) on texts
 Home-page: https://github.com/hidadeng/cntext
 Author: 大邓

diff --git a/cntext/.DS_Store b/cntext/.DS_Store
diff --git a/cntext/__init__.py b/cntext/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "1.8.2"
+__version__ = "1.8.4"
 
 from cntext.dictionary import SoPmi, W2VModels, co_occurrence_matrix, Glove
 from cntext.similarity import jaccard_sim, minedit_sim, simple_sim, cosine_sim

diff --git a/cntext/dictionary.py b/cntext/dictionary.py
@@ -266,12 +266,13 @@ def __preproces(self, documents):
 
 
 
-    def train(self, input_txt_file, model_name='w2v.model', vector_size=100, min_count=5, ngram=False):
+    def train(self, input_txt_file, model_name='w2v.model', vector_size=100, window_size=6, min_count=5, ngram=False):
         """
         train word2vec model for corpus
         :param input_txt_file:  corpus file path
         :param model_name:  used as model name(save)
         :param vector_size: dimensionality of the word vectors.
+        :param window_size: window size for word2vec
         :param min_count: Set the word to appear at least min_count times in the model
         :param ngram: whether to take the ngram case into account，default False
         :return:
@@ -298,7 +299,7 @@ def train(self, input_txt_file, model_name='w2v.model', vector_size=100, min_cou
             sentences = sents
 
         print('Step 2/4:...Train  word2vec model\n            used   {} s'.format(duration))
-        self.model = word2vec.Word2Vec(sentences, vector_size=vector_size, min_count=min_count, workers=multiprocessing.cpu_count())
+        self.model = word2vec.Word2Vec(sentences, vector_size=vector_size, window=window_size, min_count=min_count, workers=multiprocessing.cpu_count())
         modeldir = Path(self.cwd).joinpath('output', 'w2v_candi_words')
         Path(self.cwd).joinpath('output').mkdir(exist_ok=True)
         Path(self.cwd).joinpath('output', 'w2v_candi_words').mkdir(exist_ok=True)
@@ -373,7 +374,6 @@ def find(self, seedword_txt_file, topn=50):
 
 
 
-
 def co_occurrence_matrix(documents, window_size=2, lang='chinese'):
     """
     Build a co-word matrix

diff --git a/cntext/mind.py b/cntext/mind.py
@@ -134,8 +134,6 @@ def divergent_association_task(words, minimum=7):
 
 
 
-
-
 # class Alignment(object):
 #     def __init__(self):
 #         pass

diff --git a/dist/cntext-1.8.3-py3-none-any.whl b/dist/cntext-1.8.4-py3-none-any.whl
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name='cntext',     # 包名字
-    version='1.8.3',   # 包版本
+    version='1.8.4',   # 包版本
     description='Chinese text analysis library, which can perform word frequency statistics, dictionary expansion, sentiment analysis, similarity, readability, co-occurrence analysis, social calculation (attitude, prejudice, culture) on texts',   # 简单描述
     author='大邓',  # 作者
     author_email='thunderhit@qq.com',  # 邮箱