Skip to content

Commit

Permalink
1.8.4
Browse files: browse the repository at this point in the history
  • Loading branch information
hiDaDeng committed Mar 9, 2023
1 parent 4770ffe commit 4f305cc
Show file tree
Hide file tree
Showing 11 changed files with 10 additions and 14 deletions.
Binary file modified .DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion build/lib/cntext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "1.8.2"
__version__ = "1.8.4"

from cntext.dictionary import SoPmi, W2VModels, co_occurrence_matrix, Glove
from cntext.similarity import jaccard_sim, minedit_sim, simple_sim, cosine_sim
Expand Down
6 changes: 3 additions & 3 deletions build/lib/cntext/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,12 +266,13 @@ def __preproces(self, documents):



def train(self, input_txt_file, model_name='w2v.model', vector_size=100, min_count=5, ngram=False):
def train(self, input_txt_file, model_name='w2v.model', vector_size=100, window_size=6, min_count=5, ngram=False):
"""
train word2vec model for corpus
:param input_txt_file: corpus file path
:param model_name: used as model name(save)
:param vector_size: dimensionality of the word vectors.
:param window_size: window size for word2vec
:param min_count: Set the word to appear at least min_count times in the model
:param ngram: whether to take the ngram case into account,default False
:return:
Expand All @@ -298,7 +299,7 @@ def train(self, input_txt_file, model_name='w2v.model', vector_size=100, min_cou
sentences = sents

print('Step 2/4:...Train word2vec model\n used {} s'.format(duration))
self.model = word2vec.Word2Vec(sentences, vector_size=vector_size, min_count=min_count, workers=multiprocessing.cpu_count())
self.model = word2vec.Word2Vec(sentences, vector_size=vector_size, window=window_size, min_count=min_count, workers=multiprocessing.cpu_count())
modeldir = Path(self.cwd).joinpath('output', 'w2v_candi_words')
Path(self.cwd).joinpath('output').mkdir(exist_ok=True)
Path(self.cwd).joinpath('output', 'w2v_candi_words').mkdir(exist_ok=True)
Expand Down Expand Up @@ -373,7 +374,6 @@ def find(self, seedword_txt_file, topn=50):




def co_occurrence_matrix(documents, window_size=2, lang='chinese'):
"""
Build a co-word matrix
Expand Down
2 changes: 0 additions & 2 deletions build/lib/cntext/mind.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,6 @@ def divergent_association_task(words, minimum=7):





# class Alignment(object):
# def __init__(self):
# pass
Expand Down
2 changes: 1 addition & 1 deletion cntext.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: cntext
Version: 1.8.3
Version: 1.8.4
Summary: Chinese text analysis library, which can perform word frequency statistics, dictionary expansion, sentiment analysis, similarity, readability, co-occurrence analysis, social calculation (attitude, prejudice, culture) on texts
Home-page: https://github.com/hidadeng/cntext
Author: 大邓
Expand Down
Binary file added cntext/.DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion cntext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "1.8.2"
__version__ = "1.8.4"

from cntext.dictionary import SoPmi, W2VModels, co_occurrence_matrix, Glove
from cntext.similarity import jaccard_sim, minedit_sim, simple_sim, cosine_sim
Expand Down
6 changes: 3 additions & 3 deletions cntext/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,12 +266,13 @@ def __preproces(self, documents):



def train(self, input_txt_file, model_name='w2v.model', vector_size=100, min_count=5, ngram=False):
def train(self, input_txt_file, model_name='w2v.model', vector_size=100, window_size=6, min_count=5, ngram=False):
"""
train word2vec model for corpus
:param input_txt_file: corpus file path
:param model_name: used as model name(save)
:param vector_size: dimensionality of the word vectors.
:param window_size: window size for word2vec
:param min_count: Set the word to appear at least min_count times in the model
:param ngram: whether to take the ngram case into account,default False
:return:
Expand All @@ -298,7 +299,7 @@ def train(self, input_txt_file, model_name='w2v.model', vector_size=100, min_cou
sentences = sents

print('Step 2/4:...Train word2vec model\n used {} s'.format(duration))
self.model = word2vec.Word2Vec(sentences, vector_size=vector_size, min_count=min_count, workers=multiprocessing.cpu_count())
self.model = word2vec.Word2Vec(sentences, vector_size=vector_size, window=window_size, min_count=min_count, workers=multiprocessing.cpu_count())
modeldir = Path(self.cwd).joinpath('output', 'w2v_candi_words')
Path(self.cwd).joinpath('output').mkdir(exist_ok=True)
Path(self.cwd).joinpath('output', 'w2v_candi_words').mkdir(exist_ok=True)
Expand Down Expand Up @@ -373,7 +374,6 @@ def find(self, seedword_txt_file, topn=50):




def co_occurrence_matrix(documents, window_size=2, lang='chinese'):
"""
Build a co-word matrix
Expand Down
2 changes: 0 additions & 2 deletions cntext/mind.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,6 @@ def divergent_association_task(words, minimum=7):





# class Alignment(object):
# def __init__(self):
# pass
Expand Down
Binary file not shown.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

setup(
name='cntext', # 包名字
version='1.8.3', # 包版本
version='1.8.4', # 包版本
description='Chinese text analysis library, which can perform word frequency statistics, dictionary expansion, sentiment analysis, similarity, readability, co-occurrence analysis, social calculation (attitude, prejudice, culture) on texts', # 简单描述
author='大邓', # 作者
author_email='thunderhit@qq.com', # 邮箱
Expand Down

0 comments on commit 4f305cc

Please sign in to comment.