From 832597515ba0d731c93b1f59cb1044f4ef8dd9b5 Mon Sep 17 00:00:00 2001
From: Erick Peirson <erick.peirson@asu.edu>
Date: Mon, 11 Jul 2016 18:40:14 -0400
Subject: [PATCH] bringing DTM back #147 TETHNE-130

---
 setup.py                        |   1 +
 tethne/model/corpus/dtm.py      | 269 +++++++++++++++++++++++---------
 tethne/tests/test_models_dtm.py |  60 +++----
 3 files changed, 224 insertions(+), 106 deletions(-)

diff --git a/setup.py b/setup.py
index 82c8cc2b..fcdd2a2d 100644
--- a/setup.py
+++ b/setup.py
@@ -51,5 +51,6 @@
         "slate",
         "Unidecode==0.4.17",
         "nltk",
+        "numpy==1.9.3"
     ],
 )
diff --git a/tethne/model/corpus/dtm.py b/tethne/model/corpus/dtm.py
index 4c773daf..3de6b64b 100644
--- a/tethne/model/corpus/dtm.py
+++ b/tethne/model/corpus/dtm.py
@@ -5,13 +5,19 @@
 from tethne.model import Model
 
 import os, sys, re, shutil, tempfile, subprocess, csv, platform, inspect
+try:
+    import numpy as np
+except ImportError:
+    raise ImportError('DTMModel requires Numpy')
+
 from collections import defaultdict
 
 TETHNE_PATH = os.path.join(os.path.dirname(os.path.abspath(inspect.stack()[0][1])), '..', '..')
 DTM_PATH = os.path.join(TETHNE_PATH, 'bin', 'dtm')
 
 
-def _to_dtm_input(corpus, target, featureset_name, fields=['date','atitle'], **slice_kwargs):
+def _to_dtm_input(corpus, target, featureset_name, fields=['date','atitle'],
+                  **slice_kwargs):
     """
 
     Parameters
@@ -99,10 +105,12 @@ def _to_dtm_input(corpus, target, featureset_name, fields=['date','atitle'], **s
     #       ...
     #       number_docs_time_NumberTimestamps
     #
+    years = []
     with open(target + '-seq.dat', 'wb') as seqFile:
         seqFile.write(str(len(seq)) + '\n')
         for year, papers in sorted(seq.items()):
             seqFile.write('{0}\n'.format(len(papers)))
+            years.append(year)
 
     #       a file with all of the words in the vocabulary, arranged in
     #       the same order as the word indices
@@ -110,7 +118,7 @@ def _to_dtm_input(corpus, target, featureset_name, fields=['date','atitle'], **s
         for index, word in sorted(vocab.items()):
             vocabFile.write('{0}\n'.format(word))
 
-    return len(seq)
+    return years, len(seq)
 
 
 class DTMModel(Model):
@@ -180,54 +188,36 @@ def run(self, **kwargs):
 
         while p.poll() is None:
             l = p.stderr.readline()
-            print l
-            try:    # Find the LL
-                this_ll = float(re.findall(r'^lhood\s+=\s+([-]?\d+\.\d+)', l)[0])
-                self.ll.append(this_ll)
-
+            match = re.match(r'^lhood\s+=\s+([-]?\d+\.\d+)', l)
+            if match:   # Find the LL; keep it numeric, as the old parser did.
+                self.ll.append(float(match.group(1)))
                 self.ll_iters.append(i)
                 i += 1
-            except IndexError:
-                pass
-
-            try:    # Find conv
-                conv = re.findall(r'conv\s+=\s+([-]?\d+\.\d+e[-]\d+)', l)
-                self.conv.append(float(conv[0]))
 
-                progress = int(100 * float(len(self.conv))/float(max_v))
-
-            except IndexError:
-                pass
         self.load()
 
-        # self.max_iter += lda_max_em_iter   # TODO: does this make sense?
-
-
     def _generate_corpus(self, **slice_kwargs):
         """
         Writes a corpus to disk amenable to DTM.
         """
 
-        self.N = _to_dtm_input(self.corpus, self.temp+'/tethne', featureset_name=self.featureset_name)
+        self.years, self.N = _to_dtm_input(self.corpus, self.temp+'/tethne',
+                                           featureset_name=self.featureset_name,
+                                           **slice_kwargs)
 
     def load(self):
         """Load and return a :class:`.DTMModel`\."""
 
-        self.e_theta, self.phi, self.metadata, self.vocabulary = from_gerrish(self.outname, self.meta_path, self.vocab_path)
+        result = from_gerrish(self.outname, self.meta_path, self.vocab_path)
+        self.e_theta, self.phi, self.metadata, self.vocabulary = result
 
-        self.Z = e_theta.shape[0]   # Number of topics.
-        self.M = e_theta.shape[1]   # Number of documents.
-
-        self.W = phi.shape[1]    # Number of words.
-        self.T = phi.shape[2]    # Number of time periods.
-
-        self.lookup = { v['id']:k for k,v in metadata.iteritems() }
-
-        logging.debug('DTMModel.__init__(): loaded model with' + \
-                   ' {0} topics, {1} documents,'.format(self.Z, self.M) + \
-                   ' {0} words, {1} time periods.'.format(self.W, self.T))
+        self.Z = self.e_theta.shape[0]   # Number of topics.
+        self.M = self.e_theta.shape[1]   # Number of documents.
 
+        self.W = self.phi.shape[1]    # Number of words.
+        self.T = self.phi.shape[2]    # Number of time periods.
 
+        self.lookup = {v['id']:k for k,v in self.metadata.iteritems()}
 
     def _item_description(self, i, **kwargs):
         """
@@ -269,8 +259,6 @@ def _dimension_items(self, k, threshold, **kwargs):
 
     def topic_evolution(self, k, Nwords=5):
         """
-        Generate a plot that shows p(w|z) over time for the top ``Nwords``
-        terms.
 
         Parameters
         ----------
@@ -288,25 +276,19 @@ def topic_evolution(self, k, Nwords=5):
         """
 
         t_keys = range(self.T)
-        t_values = {}
+        t_values = defaultdict(dict)
         for t in t_keys:
             dim = self.dimension(k, t=t, top=Nwords)
-            for w,p in dim:
-                if w not in t_values:
-                    t_values[w] = {}
+            for w, p in dim:
                 t_values[w][t] = p
 
-        t_series = {}
+        t_series = defaultdict(list)
         for w, values in t_values.iteritems():
             word = self.vocabulary[w]
-            series = []
             for t in t_keys:
-                if t in values:
-                    series.append(values[t])
-                else:   # No value for that time-period.
-                    series.append(0.)
-            t_series[word] = series
+                t_series[word].append(values[t] if t in values else 0.)
 
+        t_keys = getattr(self, 'years', t_keys)
         return t_keys, t_series
 
     def list_topic(self, k, t, Nwords=10):
@@ -327,15 +309,12 @@ def list_topic(self, k, t, Nwords=10):
         as_list : list
             List of words in topic.
         """
-        words = self.dimension(k, t=t, top=Nwords)
-        as_list = [ self.vocabulary[w] for w,p in words ]
 
-        return as_list
+        words = self.dimension(k, t=t, top=Nwords)
+        return [self.vocabulary[w] for w, p in words]
 
     def list_topic_diachronic(self, k, Nwords=10):
-        as_dict = { t:self.list_topic(k, t, Nwords)
-                        for t in xrange(self.T) }
-        return as_dict
+        return {t: self.list_topic(k, t, Nwords) for t in xrange(self.T)}
 
     def print_topic_diachronic(self, k, Nwords=10):
         as_dict = self.list_topic_diachronic(k, Nwords)
@@ -359,15 +338,9 @@ def print_topic(self, k, t, Nwords=10):
         Nwords : int
             Number of words to return.
 
-        Returns
-        -------
-        as_string : str
-            Joined list of words in topic.
         """
 
-        as_string = ', '.join(self.list_topic(k, t=t, Nwords=Nwords))
-
-        print as_string
+        print u', '.join(self.list_topic(k, t=t, Nwords=Nwords))
 
     def list_topics(self, t, Nwords=10):
         """
@@ -382,15 +355,11 @@ def list_topics(self, t, Nwords=10):
 
         Returns
         -------
-        as_dict : dict
+        dict
             Keys are topic indices, values are list of words.
         """
 
-        as_dict = {}
-        for k in xrange(self.Z):
-            as_dict[k] = self.list_topic(k, t, Nwords)
-
-        return as_dict
+        return {k: self.list_topic(k, t, Nwords) for k in xrange(self.Z)}
 
     def print_topics(self, t, Nwords=10):
         """
@@ -409,13 +378,169 @@ def print_topics(self, t, Nwords=10):
             Newline-delimited lists of words for each topic.
         """
 
-        as_dict = self.list_topics(t, Nwords)
-        s = []
-        for key, value in as_dict.iteritems():
-            s.append('{0}: {1}'.format(key, ', '.join(value)))
-        as_string = '\n'.join(s)
 
-        print as_string
+        print u'\n'.join([u'{0}: {1}'.format(key, u', '.join(value))
+                          for key, value
+                          in self.list_topics(t, Nwords).iteritems()])
+
+    def item(self, i, top=None, **kwargs):
+        """
+        Describes an item in terms of dimensions and weights.
+
+        Subclass must provide ``_item_description(i)`` method.
+
+        Parameters
+        ----------
+        i : int
+            Index for an item.
+        top : int
+            (optional) Number of (highest-w) dimensions to return. If fewer
+            than ``top`` dimensions exist, all of them are returned.
+        kwargs : kwargs
+            Passed through to ``_item_description``.
+
+        Returns
+        -------
+        description : list
+            A list of ( dimension , weight ) tuples; sorted by descending
+            weight when ``top`` is given.
+        """
+
+        try:
+            description = self._item_description(i, **kwargs)
+        except KeyError:
+            raise KeyError('No such item index in this model.')
+        except AttributeError:
+            raise NotImplementedError('_item_description() not implemented' + \
+                                      ' for this model class.')
+
+        # Optionally, select only the top-weighted dimensions.
+        if isinstance(top, int):
+            # Stable descending sort: ties keep their original order, and the
+            # slice cannot run past the end of a short description.
+            ranked = sorted(description, key=lambda dw: dw[1], reverse=True)
+            return [(dim, w) for dim, w in ranked[:max(top, 0)]]
+
+        return description
+
+    def item_relationship(self, i, j, **kwargs):
+        """
+        Relate item ``i`` to item ``j``.
+
+        Delegates to the ``_item_relationship(i, j)`` method of the subclass.
+
+        Parameters
+        ----------
+        i : int
+            Index of the first item.
+        j : int
+            Index of the second item.
+
+        Returns
+        -------
+        list
+            A list of ( dimension ,  weight ) tuples.
+        """
+        message = '_item_relationship() not implemented for this model class.'
+        try:
+            relationship = self._item_relationship(i, j, **kwargs)
+        except AttributeError:
+            raise NotImplementedError(message)
+        return relationship
+
+    def dimension(self, d, top=None, asmatrix=False, **kwargs):
+        """
+        Describes a dimension (eg a topic).
+
+        Subclass must provide ``_dimension_description(d)`` method.
+
+        Parameters
+        ----------
+        d : int
+            Dimension index.
+        top : int
+            (optional) Number of (highest-w) features to return. If fewer
+            than ``top`` features exist, all of them are returned.
+        asmatrix : bool
+            (default: False) If True, return a sparse matrix, not a list.
+
+        Returns
+        -------
+        description : list
+            A list of ( feature, weight ) tuples (e.g. word, prob ). If
+            ``asmatrix`` is True, a SciPy CSC matrix holding the weights in
+            row ``d``, with the feature indices as columns.
+        """
+
+        try:
+            description = self._dimension_description(d, **kwargs)
+        except AttributeError:
+            raise NotImplementedError('_dimension_description() not' + \
+                                      ' implemented for this model class.')
+
+        # Optionally, select only the top-weighted features. Ties keep their
+        # original order (stable sort), and ``d`` is left untouched.
+        if isinstance(top, int):
+            ranked = sorted(description, key=lambda fw: fw[1], reverse=True)
+            description = [(f, w) for f, w in ranked[:max(top, 0)]]
+
+        if asmatrix:
+            from scipy.sparse import coo_matrix     # Only needed here.
+            J, K = zip(*description)    # Feature indices and weights.
+            I = [d] * len(J)            # Every entry sits in row ``d``.
+            return coo_matrix((list(K), (I, list(J)))).tocsc()
+
+        return description
+
+    def dimension_items(self, d, threshold, **kwargs):
+        """
+        List the items in which dimension ``d`` is represented.
+
+        Delegates to the ``_dimension_items(d, threshold)`` subclass method.
+
+        Parameters
+        ----------
+        d : int
+            Index of the dimension.
+        threshold : float
+            Minimum representation of ``d`` in an item.
+
+        Returns
+        -------
+        description : list
+            A list of ( item, weight ) tuples.
+        """
+        message = '_dimension_items() not implemented for this model class.'
+        try:
+            items = self._dimension_items(d, threshold, **kwargs)
+        except AttributeError:
+            raise NotImplementedError(message)
+        return items
+
+    def dimension_relationship(self, d, e, **kwargs):
+        """
+        Relate dimension ``d`` to dimension ``e``.
+
+        Delegates to the ``_dimension_relationship(d, e)`` subclass method.
+
+        Parameters
+        ----------
+        d : int
+            Index of the first dimension.
+        e : int
+            Index of the second dimension.
+
+        Returns
+        -------
+        relationship : list
+            A list of ( factor ,  weight ) tuples.
+        """
+        msg = '_dimension_relationship() not implemented for this model class.'
+        try:
+            relationship = self._dimension_relationship(d, e, **kwargs)
+        except AttributeError:
+            raise NotImplementedError(msg)
+        return relationship
 
 
 
@@ -517,7 +642,7 @@ def load(self):
                 self.handler[fs[-2]](fname, z)
 
         tkeys = sorted(self.tdict.keys())
-        self.phi = np.array( [ self.tdict[z] for z in tkeys ])
+        self.phi = np.array([self.tdict[z] for z in tkeys])
 
         return self.e_theta, self.phi, self.metadata, self.vocabulary
 
diff --git a/tethne/tests/test_models_dtm.py b/tethne/tests/test_models_dtm.py
index d76d2d22..5cbfc5da 100644
--- a/tethne/tests/test_models_dtm.py
+++ b/tethne/tests/test_models_dtm.py
@@ -26,16 +26,20 @@
 from tethne.model.corpus.dtm import _to_dtm_input
 
 
+def _cleanUp(basepath):     # Best-effort removal of the DTM input files.
+    for fname in ['meta', 'mult', 'seq', 'vocab']:
+        try:
+            os.remove(basepath + '-%s.dat' % fname)
+        except OSError:
+            pass    # Removal failed (e.g. file never written); carry on.
+
+
 class TestToDTMInput(unittest.TestCase):
     def setUp(self):
         self.corpus = read(datapath, index_by='wosid')
         self.basepath = os.path.join(sandbox, 'dtm_test')
         self.corpus.index_feature('abstract', word_tokenize)
-        for fname in ['meta', 'mult', 'seq', 'vocab']:
-            try:
-                os.remove(self.basepath + '-%s.dat' % fname)
-            except OSError:
-                pass
+        _cleanUp(self.basepath)
 
     def test_to_dtm_input(self):
         _to_dtm_input(self.corpus, self.basepath, 'abstract')
@@ -44,8 +48,7 @@ def test_to_dtm_input(self):
             self.assertTrue(os.path.exists(self.basepath + '-%s.dat' % fname))
 
     def tearDown(self):
-        for fname in ['meta', 'mult', 'seq', 'vocab']:
-            os.remove(self.basepath + '-%s.dat' % fname)
+        _cleanUp(self.basepath)
 
 
 class TestDTMModel(unittest.TestCase):
@@ -53,39 +56,28 @@ def setUp(self):
         self.corpus = read(datapath, index_by='wosid')
         self.basepath = os.path.join(sandbox, 'dtm_test')
         self.corpus.index_feature('abstract', word_tokenize)
-        for fname in ['meta', 'mult', 'seq', 'vocab']:
-            try:
-                os.remove(self.basepath + '-%s.dat' % fname)
-            except OSError:
-                pass
+        _cleanUp(self.basepath)
 
     def test_init(self):
         self.model = DTMModel(self.corpus, featureset_name='abstract')
 
     def test_fit(self):
         self.model = DTMModel(self.corpus, featureset_name='abstract')
-        self.model.fit(Z=20)
-
-# class TestLDAModel(unittest.TestCase):
-#     def setUp(self):
-#         from tethne.model.corpus.dtm import DTMModel
-#         corpus = read(datapath, index_by='wosid')
-#         corpus.index_feature('abstract', tokenize, structured=True)
-#         self.model = LDAModel(corpus, featureset_name='abstract')
-#         self.model.fit(Z=20, max_iter=500)
-#
-#     def test_ldamodel(self):
-#         dates, rep = self.model.topic_over_time(1)
-#         self.assertGreater(sum(rep), 0)
-#         self.assertEqual(len(dates), len(rep))
-#
-#         self.assertIsInstance(self.model.phi, FeatureSet)
-#         self.assertIsInstance(self.model.theta, FeatureSet)
-#
-#         self.assertIsInstance(self.model.list_topics(), list)
-#         self.assertGreater(len(self.model.list_topics()), 0)
-#         self.assertIsInstance(self.model.list_topic(0), list)
-#         self.assertGreater(len(self.model.list_topic(0)), 0)
+        self.model.fit(Z=5)
+
+        self.assertEqual(self.model.e_theta.shape, (5, 220))
+        self.assertEqual(self.model.phi.shape, (5, 7429, 12))
+
+        keys, values = self.model.topic_evolution(0)
+        self.assertEqual(keys, sorted(self.corpus.indices['date'].keys()))
+
+        self.model.list_topic(0, 0)
+        self.model.list_topic_diachronic(0)
+        self.model.list_topics(0)
+
+    def tearDown(self):
+        _cleanUp(self.basepath)
+
 
 
 if __name__ == '__main__':