Merge pull request #618 from amath-idm/update-summary

Update summary
InstituteforDiseaseModeling · Sep 27, 2023 · 9fe3aed · 9fe3aed
2 parents 84929ff + 1548f1c
commit 9fe3aed
Show file tree

Hide file tree

Showing 5 changed files with 78 additions and 54 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -9,6 +9,11 @@ All notable changes to the codebase are documented in this file. Changes that ma
    :depth: 1
 
 
+Version 1.2.7 (2023-09-22)
+---------------------------
+- Updates ``sim.summary`` to have more useful information
+- *Github info* PR `618 <https://github.com/amath-idm/hpvsim/pull/618>`__
+
 Version 1.2.6 (2023-09-22)
 ---------------------------
 - Fixes plotting issue with MultiSims and Jupyter notebooks
@@ -17,27 +22,27 @@ Version 1.2.6 (2023-09-22)
 
 Version 1.2.5 (2023-09-21)
 ---------------------------
-- Fixes file path when run via Jupyter.
+- Fixes file path when run via Jupyter
 - *Github info* PR `610 <https://github.com/amath-idm/hpvsim/pull/610>`__
 
 Version 1.2.4 (2023-09-19)
 ---------------------------
-- Fixes Matplotlib regression in plotting.
+- Fixes Matplotlib regression in plotting
 - *Github info* PR `609 <https://github.com/amath-idm/hpvsim/pull/609>`__
 
 Version 1.2.3 (2023-08-30)
 ---------------------------
-- Updates data loading to be much more efficient.
+- Updates data loading to be much more efficient
 - *Github info* PR `604 <https://github.com/amath-idm/hpvsim/pull/604>`__
 
 Version 1.2.2 (2023-08-11)
 ---------------------------
-- Improved tests and included ``conda`` environment specification.
+- Improved tests and included ``conda`` environment specification
 - *Github info* PR `598 <https://github.com/amath-idm/hpvsim/pull/598>`__
 
 Version 1.2.1 (2023-07-09)
 ---------------------------
-- Updated data files being used. 
+- Updated data files being used
 - *Github info* PR `586 <https://github.com/amath-idm/hpvsim/pull/586>`__
 
 Version 1.2.0 (2023-05-31)

diff --git a/hpvsim/base.py b/hpvsim/base.py
@@ -405,7 +405,7 @@ def result_keys(self, which='all'):
         '''
         Get the actual results objects, not other things stored in sim.results.
 
-        If which is 'main', return only the main results keys. If 'genotype', return
+        If which is 'total', return only the main results keys. If 'genotype', return
         only genotype keys. If 'all', return all keys.
 
         '''
@@ -568,7 +568,7 @@ def to_json(self, filename=None, keys=None, tostring=False, indent=2, verbose=Fa
 
         # Handle keys
         if keys is None:
-            keys = ['results', 'pars', 'summary']
+            keys = ['results', 'pars', 'summary', 'short_summary']
         keys = sc.promotetolist(keys)
 
         # Convert to JSON-compatible format
@@ -588,6 +588,11 @@ def to_json(self, filename=None, keys=None, tostring=False, indent=2, verbose=Fa
                     d['summary'] = dict(sc.dcp(self.summary))
                 else:
                     d['summary'] = 'Summary not available (Sim has not yet been run)'
+            elif key == 'short_summary':
+                if self.results_ready:
+                    d['short_summary'] = dict(sc.dcp(self.short_summary))
+                else:
+                    d['short_summary'] = 'Full summary not available (Sim has not yet been run)'
             else: # pragma: no cover
                 try:
                     d[key] = sc.sanitizejson(getattr(self, key))
@@ -612,7 +617,7 @@ def to_df(self, date_index=False):
         '''
         resdict = self.export_results(for_json=False)
         resdict = {k:v for k,v in resdict.items() if v.ndim == 1}
-        df = pd.DataFrame.from_dict(resdict)
+        df = sc.dataframe.from_dict(resdict)
         df['year'] = self.res_yearvec
         new_columns = ['t','year'] + df.columns[1:-1].tolist() # Get column order
         df = df.reindex(columns=new_columns) # Reorder so 't' and 'date' are first
@@ -641,7 +646,7 @@ def to_excel(self, filename=None, skip_pars=None):
         # Export parameters
         pars = {str(k):sc.dcp(v) for k,v in self.pars.items() if k not in skip_pars}
         pars['immunity_map'] = {str(k):v for k,v in pars['immunity_map'].items()}
-        par_df = pd.DataFrame.from_dict(sc.flattendict(pars, sep='_'), orient='index', columns=['Value'])
+        par_df = sc.dataframe.from_dict(sc.flattendict(pars, sep='_'), orient='index', columns=['Value'])
         par_df.index.name = 'Parameter'
 
         # Convert to spreadsheet
@@ -777,7 +782,7 @@ def _get_ia(self, which, label=None, partial=False, as_list=False, as_inds=False
         n_ia = len(ia_list) # Number of interventions/analyzers
 
         if label == 'summary': # Print a summary of the interventions
-            df = pd.DataFrame(columns=['ind', 'label', 'type'])
+            df = sc.dataframe(columns=['ind', 'label', 'type'])
             for ind,ia_obj in enumerate(ia_list):
                 df = df.append(dict(ind=ind, label=str(ia_obj.label), type=type(ia_obj)), ignore_index=True)
             print(f'Summary of {which}:')
@@ -1508,7 +1513,7 @@ def indices(self):
 
     def to_df(self):
         ''' Convert to a Pandas dataframe '''
-        df = pd.DataFrame.from_dict({key:self[key] for key in self.keys()})
+        df = sc.dataframe.from_dict({key:self[key] for key in self.keys()})
         return df
 
     def to_arr(self):

diff --git a/hpvsim/sim.py b/hpvsim/sim.py
@@ -387,7 +387,6 @@ def init_genotypes(self, upper_dysp_lim=200):
         # Do any precomputations for the genotype transformation functions
         t_step = self['dt']
         t_sequence = np.arange(0, upper_dysp_lim, t_step)
-        timesteps = t_sequence / t_step
         cumdysp = dict()
         for g in range(self['n_genotypes']):
             sev_fn = self['genotype_pars'][g]['sev_fn']
@@ -463,14 +462,14 @@ def init_res(*args, **kwargs):
             results[f'n_{stock.name}_by_genotype']  = init_res(stock.label+' by genotype', n_rows=ng)
 
         # Only by-age stock result we will need is number infectious, susceptible, and with cin, for HPV and CIN prevalence/incidence calculations
-        results[f'n_infectious_by_age']             = init_res('Number infectious by age', n_rows=na, color=stock.color)
-        results[f'n_females_infectious_by_age']     = init_res('Number of females infectious by age', n_rows=na, color=stock.color)
-        results[f'n_susceptible_by_age']            = init_res('Number susceptible by age', n_rows=na, color=stock.color)
-        results[f'n_transformed_by_age']            = init_res('Number transformed by age', n_rows=na, color=stock.color)
-        results[f'n_precin_by_age']                 = init_res('Number Pre-CIN by age', n_rows=na, color=stock.color)
-        results[f'n_cin1_by_age']                   = init_res('Number CIN1 by age', n_rows=na, color=stock.color)
-        results[f'n_cin2_by_age']                   = init_res('Number CIN2 by age', n_rows=na, color=stock.color)
-        results[f'n_cin3_by_age']                   = init_res('Number CIN3 by age', n_rows=na, color=stock.color)
+        results['n_infectious_by_age']             = init_res('Number infectious by age', n_rows=na, color=stock.color)
+        results['n_females_infectious_by_age']     = init_res('Number of females infectious by age', n_rows=na, color=stock.color)
+        results['n_susceptible_by_age']            = init_res('Number susceptible by age', n_rows=na, color=stock.color)
+        results['n_transformed_by_age']            = init_res('Number transformed by age', n_rows=na, color=stock.color)
+        results['n_precin_by_age']                 = init_res('Number Pre-CIN by age', n_rows=na, color=stock.color)
+        results['n_cin1_by_age']                   = init_res('Number CIN1 by age', n_rows=na, color=stock.color)
+        results['n_cin2_by_age']                   = init_res('Number CIN2 by age', n_rows=na, color=stock.color)
+        results['n_cin3_by_age']                   = init_res('Number CIN3 by age', n_rows=na, color=stock.color)
 
         # Create incidence and prevalence results
         for var,name,color in zip(hpd.inci_keys, hpd.inci_names, hpd.inci_colors):
@@ -758,7 +757,6 @@ def step(self):
         dt = self['dt'] # Timestep
         t = self.t
         ng = self['n_genotypes']
-        na = len(self.pars['age_bin_edges']) - 1 # Number of age bins
         condoms = self['condoms']
         eff_condoms = self['eff_condoms']
         beta = self['beta']
@@ -885,20 +883,20 @@ def step(self):
             f_inds = hpu.true(people['sex']==0)
             infinds = hpu.true(people['infectious'])
             f_infinds = np.intersect1d(f_inds, infinds)
-            self.results[f'n_females_infectious_by_age'][:, idx] = np.histogram(people.age[f_infinds], bins=people.age_bin_edges, weights=people.scale[f_infinds])[0]
-            self.results[f'n_infectious_by_age'][:, idx] = np.histogram(people.age[infinds], bins=people.age_bin_edges, weights=people.scale[infinds])[0]
             susinds = hpu.true(people['susceptible'])
-            self.results[f'n_susceptible_by_age'][:, idx] = np.histogram(people.age[susinds], bins=people.age_bin_edges, weights=people.scale[susinds])[0]
             transformedinds = hpu.true(people['transformed'])
-            self.results[f'n_transformed_by_age'][:, idx] = np.histogram(people.age[transformedinds], bins=people.age_bin_edges, weights=people.scale[transformedinds])[0]
             precininds = hpu.true(people['precin'])
-            self.results[f'n_precin_by_age'][:, idx] = np.histogram(people.age[precininds], bins=people.age_bin_edges, weights=people.scale[precininds])[0]
             cin1inds = hpu.true(people['cin1'])
-            self.results[f'n_cin1_by_age'][:, idx] = np.histogram(people.age[cin1inds], bins=people.age_bin_edges, weights=people.scale[cin1inds])[0]
             cin2inds = hpu.true(people['cin2'])
-            self.results[f'n_cin2_by_age'][:, idx] = np.histogram(people.age[cin2inds], bins=people.age_bin_edges, weights=people.scale[cin2inds])[0]
             cin3inds = hpu.true(people['cin3'])
-            self.results[f'n_cin3_by_age'][:, idx] = np.histogram(people.age[cin3inds], bins=people.age_bin_edges, weights=people.scale[cin3inds])[0]
+            self.results['n_females_infectious_by_age'][:, idx] = np.histogram(people.age[f_infinds], bins=people.age_bin_edges, weights=people.scale[f_infinds])[0]
+            self.results['n_infectious_by_age'][:, idx]  = np.histogram(people.age[infinds], bins=people.age_bin_edges, weights=people.scale[infinds])[0]
+            self.results['n_susceptible_by_age'][:, idx] = np.histogram(people.age[susinds], bins=people.age_bin_edges, weights=people.scale[susinds])[0]
+            self.results['n_transformed_by_age'][:, idx] = np.histogram(people.age[transformedinds], bins=people.age_bin_edges, weights=people.scale[transformedinds])[0]
+            self.results['n_precin_by_age'][:, idx] = np.histogram(people.age[precininds], bins=people.age_bin_edges, weights=people.scale[precininds])[0]
+            self.results['n_cin1_by_age'][:, idx] = np.histogram(people.age[cin1inds], bins=people.age_bin_edges, weights=people.scale[cin1inds])[0]
+            self.results['n_cin2_by_age'][:, idx] = np.histogram(people.age[cin2inds], bins=people.age_bin_edges, weights=people.scale[cin2inds])[0]
+            self.results['n_cin3_by_age'][:, idx] = np.histogram(people.age[cin3inds], bins=people.age_bin_edges, weights=people.scale[cin3inds])[0]
 
             # Create total stocks
             for key in self.people.meta.genotype_stock_keys:
@@ -1156,9 +1154,20 @@ def safedivide(num,denom):
         self.results['cum_cancer_treated'][:] = np.cumsum(self.results['new_cancer_treatments'][:], axis=0)
 
         return
-
-
-    def compute_summary(self, t=None, update=True, output=False, require_run=False):
+
+
+    def compute_age_mean(self, reskey, t=None):
+        if t is None:
+            t = -1
+        assert 'by_age' in reskey, 'This method can only be used for age results'
+        res = self.results[reskey][:,t]
+        edges = self['age_bin_edges']
+        mean_edges = edges[:-1] + np.diff(edges)/2
+        age_mean = ((res/res.sum())*mean_edges).sum()
+        return age_mean
+
+
+    def compute_summary(self, t=None, update=True, output=False, full=False, require_run=False):
         '''
         Compute the summary dict and string for the sim. Used internally; see
         sim.summarize() for the user version.
@@ -1177,17 +1186,30 @@ def compute_summary(self, t=None, update=True, output=False, require_run=False):
             errormsg = 'Simulation not yet run'
             raise RuntimeError(errormsg)
 
+        s = sc.odict()
+        s['total HPV infections'] = self.results['infections'].sum()
+        s['total cancers'] = self.results['cancers'].sum()
+        s['total cancer deaths'] = self.results['cancer_deaths'].sum()
+        s['mean HPV prevalence (%)'] = self.results['hpv_prevalence'].mean()*100
+        s['mean cancer incidence (per 100k)'] = self.results['cancer_incidence'].mean()
+        s['mean age of infection (years)'] = self.compute_age_mean('infections_by_age', t=t)
+        s['mean age of cancer (years)'] = self.compute_age_mean('cancers_by_age', t=t)
+
         summary = sc.objdict()
         for key in self.result_keys('total'):
             summary[key] = self.results[key][t]
 
         # Update the stored state
         if update:
+            self.short_summary = s
             self.summary = summary
 
         # Optionally return
         if output:
-            return summary
+            if full:
+                return summary
+            else:
+                return s
         else:
             return
 
@@ -1219,16 +1241,13 @@ def summarize(self, full=False, t=None, sep=None, output=False):
         if sep is None: sep = hpo.sep # Default separator
         labelstr = f' "{self.label}"' if self.label else ''
         string = f'Simulation{labelstr} summary:\n'
-        for key in self.result_keys('total'):
-            val = summary[key]
-            printval = f'   {val:10,.0f} '
-            label = self.results[key].name.lower().replace(',', sep)
-            if 'incidence' in key or 'prevalence' in key:
-                if key in ['hpv_prevalence', 'hpv_incidence']:
-                    printval = f'   {val*100:10.2f} '
-                    label += ' (/100)'
-                else:
-                    label += ' (/100,000)'
+        for label,val in summary.items():
+            if 'total' in label:
+                printval = f'   {val:13,.0f} '
+            elif 'mean' in label:
+                printval = f'   {val:13,.2f} '
+            else:
+                raise NotImplementedError
             string += printval + label + '\n'
 
         # Print or return string
@@ -1263,7 +1282,7 @@ class AlreadyRunError(RuntimeError):
     pass
 
 
-def diff_sims(sim1, sim2, skip_key_diffs=False, skip=None, output=False, die=False):
+def diff_sims(sim1, sim2, skip_key_diffs=False, skip=None, full=False, output=False, die=False):
     '''
     Compute the difference of the summaries of two simulations, and print any
     values which differ.
@@ -1273,6 +1292,7 @@ def diff_sims(sim1, sim2, skip_key_diffs=False, skip=None, output=False, die=Fal
         sim2 (sim/dict): ditto
         skip_key_diffs (bool): whether to skip keys that don't match between sims
         skip (list): a list of values to skip
+        full (bool): whether to use the full summary (else, brief)
         output (bool): whether to return the output as a string (otherwise print)
         die (bool): whether to raise an exception if the sims don't match
         require_run (bool): require that the simulations have been run
@@ -1285,9 +1305,9 @@ def diff_sims(sim1, sim2, skip_key_diffs=False, skip=None, output=False, die=Fal
     '''
 
     if isinstance(sim1, Sim):
-        sim1 = sim1.compute_summary(update=False, output=True, require_run=True)
+        sim1 = sim1.compute_summary(update=False, output=True, require_run=True, full=full)
     if isinstance(sim2, Sim):
-        sim2 = sim2.compute_summary(update=False, output=True, require_run=True)
+        sim2 = sim2.compute_summary(update=False, output=True, require_run=True, full=full)
     for sim in [sim1, sim2]:
         if not isinstance(sim, dict): # pragma: no cover
             errormsg = f'Cannot compare object of type {type(sim)}, must be a sim or a sim.summary dict'

diff --git a/hpvsim/version.py b/hpvsim/version.py
@@ -4,6 +4,6 @@
 
 __all__ = ['__version__', '__versiondate__', '__license__']
 
-__version__ = '1.2.6'
+__version__ = '1.2.7'
 __versiondate__ = '2023-09-22'
 __license__ = f'HPVsim {__version__} ({__versiondate__}) — © 2023 by IDM'
diff --git a/tests/test_baselines.py b/tests/test_baselines.py
@@ -6,7 +6,6 @@
 import numpy as np
 import sciris as sc
 import hpvsim as hpv
-import pandas as pd
 
 do_plot = 1
 do_save = 0
@@ -15,9 +14,6 @@
 parameters_filename = sc.thisdir(hpv.__file__, 'regression', f'pars_v{hpv.__version__}.json')
 hpv.options.set(interactive=False) # Assume not running interactively
 
-testdir = sc.thisdir(aspath=True)
-tempdir = testdir/'temp'
-tempdir.mkdir(exist_ok=True)
 
 def make_sim(use_defaults=False, do_plot=False, **kwargs):
     '''
@@ -101,12 +97,10 @@ def test_baseline():
 
     # Calculate new baseline
     new = make_sim()
-    new.to_json(tempdir/'baseline_pre_run.json')  # Check that exporting works before results have been generated
     new.run()
-    new.to_json(tempdir/'baseline_post_run.json')  # Check that exporting works before results have been generated
 
     # Compute the comparison
-    hpv.diff_sims(old, new, die=True)
+    hpv.diff_sims(old, new, full=True, die=True)
 
     return new