Skip to content

Commit

Permalink
Merge pull request #158 from diging/mallet
Browse files Browse the repository at this point in the history
TETHNE-133 can load existing MALLET output into LDAModel
  • Loading branch information
erickpeirson authored Jul 12, 2016
2 parents 9589d5e + ce95f42 commit de7bb45
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 120 deletions.
10 changes: 5 additions & 5 deletions tethne/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class Model(object):
Base class for models.
"""

def __init__(self, corpus, **kwargs):
def __init__(self, corpus, prep=True, **kwargs):
"""
Initialize the ModelManager.
"""
Expand All @@ -34,15 +34,15 @@ def __init__(self, corpus, **kwargs):
continue
setattr(self, key, value)

self.prep()
if prep:
self.prep()

def __del__(self):
"""
Delete temporary directory and all files contained therein.
"""
if hasattr(self, 'nodelete'):
if self.nodelete:
return
if getattr(self, 'nodelete', False):
return
shutil.rmtree(self.temp)

@property
Expand Down
12 changes: 8 additions & 4 deletions tethne/model/corpus/mallet.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,17 @@ def __init__(self, *args, **kwargs):
if platform.system() == 'Windows':
self.mallet_bin += '.bat'
os.environ['MALLET_HOME'] = self.mallet_path

super(LDAModel, self).__init__(*args, **kwargs)

def prep(self):
self.dt = os.path.join(self.temp, "dt.dat")
self.wt = os.path.join(self.temp, "wt.dat")
self.om = os.path.join(self.temp, "model.mallet")
if not hasattr(self, 'dt'):
self.dt = os.path.join(self.temp, "dt.dat")
if not hasattr(self, 'wt'):
self.wt = os.path.join(self.temp, "wt.dat")
if not hasattr(self, 'om'):
self.om = os.path.join(self.temp, "model.mallet")

def prep(self):
self._generate_corpus()

def _generate_corpus(self):
Expand Down
244 changes: 133 additions & 111 deletions tethne/tests/test_models_lda.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,121 +21,143 @@
logger.setLevel('DEBUG')


class TestLDAModel(unittest.TestCase):
class TestLDAModelExistingOutput(unittest.TestCase):
def setUp(self):
from tethne.model.corpus.mallet import LDAModel
corpus = read(datapath, index_by='wosid')
corpus.index_feature('abstract', tokenize, structured=True)
self.model = LDAModel(corpus, featureset_name='abstract')
self.model.fit(Z=20, max_iter=500)
self.corpus = read(datapath, index_by='wosid')
self.corpus.index_feature('abstract', tokenize, structured=True)
self.old_model = LDAModel(self.corpus, featureset_name='abstract', nodelete=True)
self.old_model.fit(Z=20, max_iter=50)

def test_ldamodel(self):
dates, rep = self.model.topic_over_time(1)
self.assertGreater(sum(rep), 0)
self.assertEqual(len(dates), len(rep))

self.assertIsInstance(self.model.phi, FeatureSet)
self.assertIsInstance(self.model.theta, FeatureSet)

self.assertIsInstance(self.model.list_topics(), list)
self.assertGreater(len(self.model.list_topics()), 0)
self.assertIsInstance(self.model.list_topic(0), list)
self.assertGreater(len(self.model.list_topic(0)), 0)

def test_networks(self):
termGraph = topics.terms(self.model)
self.assertGreater(termGraph.size(), 100)
self.assertGreater(termGraph.order(), 10)

topicGraph = topics.cotopics(self.model)
self.assertGreater(topicGraph.size(), 5)
self.assertGreater(topicGraph.order(), 0)

paperGraph = topics.topic_coupling(self.model)
self.assertGreater(paperGraph.size(), 100)
self.assertGreater(paperGraph.order(), 20)


class TestLDAModelUnstructured(unittest.TestCase):
def setUp(self):
def test_load_existing_data(self):
from tethne.model.corpus.mallet import LDAModel
corpus = read(datapath, index_by='wosid')
corpus.index_feature('abstract', tokenize)
self.model = LDAModel(corpus, featureset_name='abstract')
self.model.fit(Z=20, max_iter=500)

def test_ldamodel(self):
dates, rep = self.model.topic_over_time(1)
self.assertGreater(sum(rep), 0)
self.assertEqual(len(dates), len(rep))

self.assertIsInstance(self.model.phi, FeatureSet)
self.assertIsInstance(self.model.theta, FeatureSet)

self.assertIsInstance(self.model.list_topics(), list)
self.assertGreater(len(self.model.list_topics()), 0)
self.assertIsInstance(self.model.list_topic(0), list)
self.assertGreater(len(self.model.list_topic(0)), 0)

def test_networks(self):
termGraph = topics.terms(self.model)
self.assertGreater(termGraph.size(), 100)
self.assertGreater(termGraph.order(), 10)

topicGraph = topics.cotopics(self.model)
self.assertGreater(topicGraph.size(), 5)
self.assertGreater(topicGraph.order(), 0)

paperGraph = topics.topic_coupling(self.model)
self.assertGreater(paperGraph.size(), 100)
self.assertGreater(paperGraph.order(), 20)


class TestLDAModelWithTransformation(unittest.TestCase):
def setUp(self):
from tethne.model.corpus.mallet import LDAModel
corpus = read(datapath, index_by='wosid')
corpus.index_feature('abstract', tokenize)

xf = lambda f, c, C, DC: c*3
corpus.features['xf'] = corpus.features['abstract'].transform(xf)
self.model = LDAModel(corpus, featureset_name='xf')
self.model.fit(Z=20, max_iter=500)

def test_ldamodel(self):
dates, rep = self.model.topic_over_time(1)
self.assertGreater(sum(rep), 0)
self.assertEqual(len(dates), len(rep))

self.assertIsInstance(self.model.phi, FeatureSet)
self.assertIsInstance(self.model.theta, FeatureSet)

self.assertIsInstance(self.model.list_topics(), list)
self.assertGreater(len(self.model.list_topics()), 0)
self.assertIsInstance(self.model.list_topic(0), list)
self.assertGreater(len(self.model.list_topic(0)), 0)

def test_networks(self):
termGraph = topics.terms(self.model)
self.assertGreater(termGraph.size(), 100)
self.assertGreater(termGraph.order(), 10)

topicGraph = topics.cotopics(self.model)
self.assertGreater(topicGraph.size(), 5)
self.assertGreater(topicGraph.order(), 0)

paperGraph = topics.topic_coupling(self.model)
self.assertGreater(paperGraph.size(), 100)
self.assertGreater(paperGraph.order(), 20)


class TestLDAModelMALLETPath(unittest.TestCase):
def test_direct_import(self):
from tethne import LDAModel
corpus = read(datapath, index_by='wosid')
corpus.index_feature('abstract', tokenize, structured=True)
self.model = LDAModel(corpus, featureset_name='abstract')
self.model.fit(Z=20, max_iter=500)
new_model = LDAModel(self.corpus, featureset_name='abstract',
nodelete=True,
prep=False,
wt=self.old_model.wt,
dt=self.old_model.dt,
om=self.old_model.om)
new_model.load()

self.assertEqual(self.old_model.topics_in(u'WOS:000295037200001'),
new_model.topics_in(u'WOS:000295037200001'))


# class TestLDAModel(unittest.TestCase):
# def setUp(self):
# from tethne.model.corpus.mallet import LDAModel
# corpus = read(datapath, index_by='wosid')
# corpus.index_feature('abstract', tokenize, structured=True)
# self.model = LDAModel(corpus, featureset_name='abstract')
# self.model.fit(Z=20, max_iter=500)
#
# def test_ldamodel(self):
# dates, rep = self.model.topic_over_time(1)
# self.assertGreater(sum(rep), 0)
# self.assertEqual(len(dates), len(rep))
#
# self.assertIsInstance(self.model.phi, FeatureSet)
# self.assertIsInstance(self.model.theta, FeatureSet)
#
# self.assertIsInstance(self.model.list_topics(), list)
# self.assertGreater(len(self.model.list_topics()), 0)
# self.assertIsInstance(self.model.list_topic(0), list)
# self.assertGreater(len(self.model.list_topic(0)), 0)
#
# def test_networks(self):
# termGraph = topics.terms(self.model)
# self.assertGreater(termGraph.size(), 100)
# self.assertGreater(termGraph.order(), 10)
#
# topicGraph = topics.cotopics(self.model)
# self.assertGreater(topicGraph.size(), 5)
# self.assertGreater(topicGraph.order(), 0)
#
# paperGraph = topics.topic_coupling(self.model)
# self.assertGreater(paperGraph.size(), 100)
# self.assertGreater(paperGraph.order(), 20)
#
#
# class TestLDAModelUnstructured(unittest.TestCase):
# def setUp(self):
# from tethne.model.corpus.mallet import LDAModel
# corpus = read(datapath, index_by='wosid')
# corpus.index_feature('abstract', tokenize)
# self.model = LDAModel(corpus, featureset_name='abstract')
# self.model.fit(Z=20, max_iter=500)
#
# def test_ldamodel(self):
# dates, rep = self.model.topic_over_time(1)
# self.assertGreater(sum(rep), 0)
# self.assertEqual(len(dates), len(rep))
#
# self.assertIsInstance(self.model.phi, FeatureSet)
# self.assertIsInstance(self.model.theta, FeatureSet)
#
# self.assertIsInstance(self.model.list_topics(), list)
# self.assertGreater(len(self.model.list_topics()), 0)
# self.assertIsInstance(self.model.list_topic(0), list)
# self.assertGreater(len(self.model.list_topic(0)), 0)
#
# def test_networks(self):
# termGraph = topics.terms(self.model)
# self.assertGreater(termGraph.size(), 100)
# self.assertGreater(termGraph.order(), 10)
#
# topicGraph = topics.cotopics(self.model)
# self.assertGreater(topicGraph.size(), 5)
# self.assertGreater(topicGraph.order(), 0)
#
# paperGraph = topics.topic_coupling(self.model)
# self.assertGreater(paperGraph.size(), 100)
# self.assertGreater(paperGraph.order(), 20)
#
#
# class TestLDAModelWithTransformation(unittest.TestCase):
# def setUp(self):
# from tethne.model.corpus.mallet import LDAModel
# corpus = read(datapath, index_by='wosid')
# corpus.index_feature('abstract', tokenize)
#
# xf = lambda f, c, C, DC: c*3
# corpus.features['xf'] = corpus.features['abstract'].transform(xf)
# self.model = LDAModel(corpus, featureset_name='xf')
# self.model.fit(Z=20, max_iter=500)
#
# def test_ldamodel(self):
# dates, rep = self.model.topic_over_time(1)
# self.assertGreater(sum(rep), 0)
# self.assertEqual(len(dates), len(rep))
#
# self.assertIsInstance(self.model.phi, FeatureSet)
# self.assertIsInstance(self.model.theta, FeatureSet)
#
# self.assertIsInstance(self.model.list_topics(), list)
# self.assertGreater(len(self.model.list_topics()), 0)
# self.assertIsInstance(self.model.list_topic(0), list)
# self.assertGreater(len(self.model.list_topic(0)), 0)
#
# def test_networks(self):
# termGraph = topics.terms(self.model)
# self.assertGreater(termGraph.size(), 100)
# self.assertGreater(termGraph.order(), 10)
#
# topicGraph = topics.cotopics(self.model)
# self.assertGreater(topicGraph.size(), 5)
# self.assertGreater(topicGraph.order(), 0)
#
# paperGraph = topics.topic_coupling(self.model)
# self.assertGreater(paperGraph.size(), 100)
# self.assertGreater(paperGraph.order(), 20)
#
#
# class TestLDAModelMALLETPath(unittest.TestCase):
# def test_direct_import(self):
# from tethne import LDAModel
# corpus = read(datapath, index_by='wosid')
# corpus.index_feature('abstract', tokenize, structured=True)
# self.model = LDAModel(corpus, featureset_name='abstract')
# self.model.fit(Z=20, max_iter=500)



Expand Down

0 comments on commit de7bb45

Please sign in to comment.