From 832597515ba0d731c93b1f59cb1044f4ef8dd9b5 Mon Sep 17 00:00:00 2001 From: Erick Peirson Date: Mon, 11 Jul 2016 18:40:14 -0400 Subject: [PATCH] bringing DTM back #147 TETHNE-130 --- setup.py | 1 + tethne/model/corpus/dtm.py | 269 +++++++++++++++++++++++--------- tethne/tests/test_models_dtm.py | 60 +++---- 3 files changed, 224 insertions(+), 106 deletions(-) diff --git a/setup.py b/setup.py index 82c8cc2b..fcdd2a2d 100644 --- a/setup.py +++ b/setup.py @@ -51,5 +51,6 @@ "slate", "Unidecode==0.4.17", "nltk", + "numpy==1.9.3" ], ) diff --git a/tethne/model/corpus/dtm.py b/tethne/model/corpus/dtm.py index 4c773daf..3de6b64b 100644 --- a/tethne/model/corpus/dtm.py +++ b/tethne/model/corpus/dtm.py @@ -5,13 +5,19 @@ from tethne.model import Model import os, sys, re, shutil, tempfile, subprocess, csv, platform, inspect +try: + import numpy as np +except ImportError: + raise ImportError('DTMModel requires Numpy') + from collections import defaultdict TETHNE_PATH = os.path.join(os.path.dirname(os.path.abspath(inspect.stack()[0][1])), '..', '..') DTM_PATH = os.path.join(TETHNE_PATH, 'bin', 'dtm') -def _to_dtm_input(corpus, target, featureset_name, fields=['date','atitle'], **slice_kwargs): +def _to_dtm_input(corpus, target, featureset_name, fields=['date','atitle'], + **slice_kwargs): """ Parameters @@ -99,10 +105,12 @@ def _to_dtm_input(corpus, target, featureset_name, fields=['date','atitle'], **s # ... # number_docs_time_NumberTimestamps # + years = [] with open(target + '-seq.dat', 'wb') as seqFile: seqFile.write(str(len(seq)) + '\n') for year, papers in sorted(seq.items()): seqFile.write('{0}\n'.format(len(papers))) + years.append(year) # a file with all of the words in the vocabulary, arranged in # the same order as the word indices @@ -110,7 +118,7 @@ def _to_dtm_input(corpus, target, featureset_name, fields=['date','atitle'], **s for index, word in sorted(vocab.items()): vocabFile.write('{0}\n'.format(word)) - return len(seq) + return years, len(seq) class DTMModel(Model): @@ -180,54 +188,36 @@ def run(self, **kwargs): while p.poll() is None: l = p.stderr.readline() - print l - try: # Find the LL - this_ll = float(re.findall(r'^lhood\s+=\s+([-]?\d+\.\d+)', l)[0]) - self.ll.append(this_ll) - + match = re.match(r'^lhood\s+=\s+([-]?\d+\.\d+)', l) + if match: # Find the LL + self.ll.append(match.groups()[0]) self.ll_iters.append(i) i += 1 - except IndexError: - pass - - try: # Find conv - conv = re.findall(r'conv\s+=\s+([-]?\d+\.\d+e[-]\d+)', l) - self.conv.append(float(conv[0])) - progress = int(100 * float(len(self.conv))/float(max_v)) - - except IndexError: - pass self.load() - # self.max_iter += lda_max_em_iter # TODO: does this make sense? - - def _generate_corpus(self, **slice_kwargs): """ Writes a corpus to disk amenable to DTM. """ - self.N = _to_dtm_input(self.corpus, self.temp+'/tethne', featureset_name=self.featureset_name) + self.years, self.N = _to_dtm_input(self.corpus, self.temp+'/tethne', + featureset_name=self.featureset_name, + **slice_kwargs) def load(self): """Load and return a :class:`.DTMModel`\.""" - self.e_theta, self.phi, self.metadata, self.vocabulary = from_gerrish(self.outname, self.meta_path, self.vocab_path) + result = from_gerrish(self.outname, self.meta_path, self.vocab_path) + self.e_theta, self.phi, self.metadata, self.vocabulary = result - self.Z = e_theta.shape[0] # Number of topics. - self.M = e_theta.shape[1] # Number of documents. - - self.W = phi.shape[1] # Number of words. - self.T = phi.shape[2] # Number of time periods. - - self.lookup = { v['id']:k for k,v in metadata.iteritems() } - - logging.debug('DTMModel.__init__(): loaded model with' + \ - ' {0} topics, {1} documents,'.format(self.Z, self.M) + \ - ' {0} words, {1} time periods.'.format(self.W, self.T)) + self.Z = self.e_theta.shape[0] # Number of topics. + self.M = self.e_theta.shape[1] # Number of documents. + self.W = self.phi.shape[1] # Number of words. + self.T = self.phi.shape[2] # Number of time periods. + self.lookup = {v['id']:k for k,v in self.metadata.iteritems()} def _item_description(self, i, **kwargs): """ @@ -269,8 +259,6 @@ def _dimension_items(self, k, threshold, **kwargs): def topic_evolution(self, k, Nwords=5): """ - Generate a plot that shows p(w|z) over time for the top ``Nwords`` - terms. Parameters ---------- @@ -288,25 +276,19 @@ def topic_evolution(self, k, Nwords=5): """ t_keys = range(self.T) - t_values = {} + t_values = defaultdict(dict) for t in t_keys: dim = self.dimension(k, t=t, top=Nwords) - for w,p in dim: - if w not in t_values: - t_values[w] = {} + for w, p in dim: t_values[w][t] = p - t_series = {} + t_series = defaultdict(list) for w, values in t_values.iteritems(): word = self.vocabulary[w] - series = [] for t in t_keys: - if t in values: - series.append(values[t]) - else: # No value for that time-period. - series.append(0.) - t_series[word] = series + t_series[word].append(values[t] if t in values else 0.) + t_keys = getattr(self, 'years', t_keys) return t_keys, t_series def list_topic(self, k, t, Nwords=10): @@ -327,15 +309,12 @@ def list_topic(self, k, t, Nwords=10): as_list : list List of words in topic. """ - words = self.dimension(k, t=t, top=Nwords) - as_list = [ self.vocabulary[w] for w,p in words ] - return as_list + words = self.dimension(k, t=t, top=Nwords) + return [self.vocabulary[w] for w, p in words] def list_topic_diachronic(self, k, Nwords=10): - as_dict = { t:self.list_topic(k, t, Nwords) - for t in xrange(self.T) } - return as_dict + return {t: self.list_topic(k, t, Nwords) for t in xrange(self.T)} def print_topic_diachronic(self, k, Nwords=10): as_dict = self.list_topic_diachronic(k, Nwords) @@ -359,15 +338,9 @@ def print_topic(self, k, t, Nwords=10): Nwords : int Number of words to return. - Returns - ------- - as_string : str - Joined list of words in topic. """ - as_string = ', '.join(self.list_topic(k, t=t, Nwords=Nwords)) - - print as_string + print u', '.join(self.list_topic(k, t=t, Nwords=Nwords)) def list_topics(self, t, Nwords=10): """ @@ -382,15 +355,11 @@ def list_topics(self, t, Nwords=10): Returns ------- - as_dict : dict + dict Keys are topic indices, values are list of words. """ - as_dict = {} - for k in xrange(self.Z): - as_dict[k] = self.list_topic(k, t, Nwords) - - return as_dict + return {k: self.list_topic(k, t, Nwords) for k in xrange(self.Z)} def print_topics(self, t, Nwords=10): """ @@ -409,13 +378,169 @@ def print_topics(self, t, Nwords=10): Newline-delimited lists of words for each topic. """ - as_dict = self.list_topics(t, Nwords) - s = [] - for key, value in as_dict.iteritems(): - s.append('{0}: {1}'.format(key, ', '.join(value))) - as_string = '\n'.join(s) - print as_string + print u'\n'.join([u'{0}: {1}'.format(key, u', '.join(value)) + for key, value + in self.list_topics(t, Nwords).iteritems()]) + + def item(self, i, top=None, **kwargs): + """ + Describes an item in terms of dimensions and weights. + + Subclass must provide ``_item_description(i)`` method. + + Parameters + ---------- + i : int + Index for an item. + top : int + (optional) Number of (highest-w) dimensions to return. + + Returns + ------- + description : list + A list of ( dimension , weight ) tuples. + """ + + try: + description = self._item_description(i, **kwargs) + except KeyError: + raise KeyError('No such item index in this model.') + except AttributeError: + raise NotImplementedError('_item_description() not implemented' + \ + ' for this model class.') + + # Optionally, select only the top-weighted dimensions. + if type(top) is int: + D, W = zip(*description) # Dimensions and Weights. + D = list(D) # To support element deletion, below. + W = list(W) + top_description = [] + while len(top_description) < top: # Avoiding Numpy argsort. + d = W.index(max(W)) # Index of top weight. + top_description.append((D[d], W[d])) + del D[d], W[d] + return top_description + return description + + def item_relationship(self, i, j, **kwargs): + """ + Describes the relationship between two items. + + Subclass must provide ``_item_relationship(i, j)`` method. + + Parameters + ---------- + i : int + Item index. + j : int + Item index. + + Returns + ------- + list + A list of ( dimension , weight ) tuples. + """ + + try: + return self._item_relationship(i, j, **kwargs) + except AttributeError: + raise NotImplementedError('_item_relationship() not implemented' \ + + ' for this model class.') + + def dimension(self, d, top=None, asmatrix=False, **kwargs): + """ + Describes a dimension (eg a topic). + + Subclass must provide ``_dimension_description(d)`` method. + + Parameters + ---------- + d : int + Dimension index. + + Returns + ------- + description : list + A list of ( feature, weight ) tuples (e.g. word, prob ). + """ + + try: + description = self._dimension_description(d, **kwargs) + except AttributeError: + raise NotImplementedError('_dimension_description() not' + \ + ' implemented for this model class.') + + # Optionally, select only the top-weighted dimensions. + if type(top) is int: + D, W = zip(*description) # Dimensions and Weights. + D = list(D) # To support element deletion, below. + W = list(W) + top_description = [] + while len(top_description) < top: # Avoiding Numpy argsort. + d = W.index(max(W)) # Index of top weight. + top_description.append((D[d], W[d])) + del D[d], W[d] + + description = top_description + + if asmatrix: + J,K = zip(*description) + I = [ d for i in xrange(len(J)) ] + mat = coo_matrix(list(K), (I,list(J))).tocsc() + return mat + + return description + + def dimension_items(self, d, threshold, **kwargs): + """ + Describes a dimension in terms of the items that contain it. + + Subclass must provide ``_dimension_items(d, threshold)`` method. + + Parameters + ---------- + d : int + Dimension index. + threshold : float + Minimum representation of ``d`` in item. + + Returns + ------- + description : list + A list of ( item, weight ) tuples. + """ + + try: + return self._dimension_items(d, threshold, **kwargs) + except AttributeError: + raise NotImplementedError('_dimension_items() not implemented for' \ + + ' this model class.') + + def dimension_relationship(self, d, e, **kwargs): + """ + Describes the relationship between two dimensions. + + Subclass must provide ``_dimension_relationship(d, e)`` method. + + Parameters + ---------- + d : int + Dimension index. + e : int + Dimension index. + + Returns + ------- + relationship : list + A list of ( factor , weight ) tuples. + """ + + try: + return self._dimension_relationship(d, e, **kwargs) + except AttributeError: + raise NotImplementedError('_dimension_relationship() not' \ + + ' implemented for this model class.') @@ -517,7 +642,7 @@ def load(self): self.handler[fs[-2]](fname, z) tkeys = sorted(self.tdict.keys()) - self.phi = np.array( [ self.tdict[z] for z in tkeys ]) + self.phi = np.array([self.tdict[z] for z in tkeys]) return self.e_theta, self.phi, self.metadata, self.vocabulary diff --git a/tethne/tests/test_models_dtm.py b/tethne/tests/test_models_dtm.py index d76d2d22..5cbfc5da 100644 --- a/tethne/tests/test_models_dtm.py +++ b/tethne/tests/test_models_dtm.py @@ -26,16 +26,20 @@ from tethne.model.corpus.dtm import _to_dtm_input +def _cleanUp(basepath): + for fname in ['meta', 'mult', 'seq', 'vocab']: + try: + os.remove(basepath + '-%s.dat' % fname) + except OSError: + pass + + class TestToDTMInput(unittest.TestCase): def setUp(self): self.corpus = read(datapath, index_by='wosid') self.basepath = os.path.join(sandbox, 'dtm_test') self.corpus.index_feature('abstract', word_tokenize) - for fname in ['meta', 'mult', 'seq', 'vocab']: - try: - os.remove(self.basepath + '-%s.dat' % fname) - except OSError: - pass + _cleanUp(self.basepath) def test_to_dtm_input(self): _to_dtm_input(self.corpus, self.basepath, 'abstract') @@ -44,8 +48,7 @@ def test_to_dtm_input(self): self.assertTrue(os.path.exists(self.basepath + '-%s.dat' % fname)) def tearDown(self): - for fname in ['meta', 'mult', 'seq', 'vocab']: - os.remove(self.basepath + '-%s.dat' % fname) + _cleanUp(self.basepath) class TestDTMModel(unittest.TestCase): @@ -53,39 +56,28 @@ def setUp(self): self.corpus = read(datapath, index_by='wosid') self.basepath = os.path.join(sandbox, 'dtm_test') self.corpus.index_feature('abstract', word_tokenize) - for fname in ['meta', 'mult', 'seq', 'vocab']: - try: - os.remove(self.basepath + '-%s.dat' % fname) - except OSError: - pass + _cleanUp(self.basepath) def test_init(self): self.model = DTMModel(self.corpus, featureset_name='abstract') def test_fit(self): self.model = DTMModel(self.corpus, featureset_name='abstract') - self.model.fit(Z=20) - -# class TestLDAModel(unittest.TestCase): -# def setUp(self): -# from tethne.model.corpus.dtm import DTMModel -# corpus = read(datapath, index_by='wosid') -# corpus.index_feature('abstract', tokenize, structured=True) -# self.model = LDAModel(corpus, featureset_name='abstract') -# self.model.fit(Z=20, max_iter=500) -# -# def test_ldamodel(self): -# dates, rep = self.model.topic_over_time(1) -# self.assertGreater(sum(rep), 0) -# self.assertEqual(len(dates), len(rep)) -# -# self.assertIsInstance(self.model.phi, FeatureSet) -# self.assertIsInstance(self.model.theta, FeatureSet) -# -# self.assertIsInstance(self.model.list_topics(), list) -# self.assertGreater(len(self.model.list_topics()), 0) -# self.assertIsInstance(self.model.list_topic(0), list) -# self.assertGreater(len(self.model.list_topic(0)), 0) + self.model.fit(Z=5) + + self.assertEqual(self.model.e_theta.shape, (5, 220)) + self.assertEqual(self.model.phi.shape, (5, 7429, 12)) + + keys, values = self.model.topic_evolution(0) + self.assertEqual(keys, self.corpus.indices['date'].keys()) + + self.model.list_topic(0, 0) + self.model.list_topic_diachronic(0) + self.model.list_topics(0) + + def tearDown(self): + _cleanUp(self.basepath) + if __name__ == '__main__':