Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Basic modifications needed to read CPS input data #1484

Merged
merged 15 commits into from
Jul 20, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ include taxcalc/behavior.json
include taxcalc/growth.json
include taxcalc/consumption.json
include taxcalc/records_variables.json
include taxcalc/cps.csv.gz
include taxcalc/cps_weights.csv.gz
7 changes: 6 additions & 1 deletion RELEASES.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,12 @@ Release 0.9.2 on 2017-??-??
- None

**New Features**
- None
- Add several taxcalc/reforms/earnings_shifting.* files that analyze the revenue implications of high-paid workers forming personal LLCs to contract with their former employers under the Trump2017.json reform
[[#1464](https://github.com/open-source-economics/Tax-Calculator/pull/1464)
by Martin Holmer]
- Add ability to read and calculate taxes with new CPS input data for 2014 and subsequent years
[[#1484](https://github.com/open-source-economics/Tax-Calculator/pull/1484)
by Martin Holmer]

**Bug Fixes**
- Fix decorators bug that appeared when numpy 1.13.1 (and pandas 0.20.2, which uses numpy 1.13) recently became available
Expand Down
2 changes: 1 addition & 1 deletion conda.recipe/install_local_taxcalc_package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ rmdir dist/
rm -fr taxcalc.egg-info/*
rmdir taxcalc.egg-info/

echo "Execute 'conda uninstall taxcalc' after using taxcalc package"
echo "Execute 'conda uninstall taxcalc --yes' after using taxcalc package"

echo "FINISHED : `date`"
exit 0
2 changes: 1 addition & 1 deletion docs/index.html

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions taxcalc/calculate.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ def __init__(self, policy=None, records=None, verbose=True,
self.records = records
else:
raise ValueError('must specify records as a Records object')
if self.policy.current_year < self.records.data_year:
self.policy.set_year(self.records.data_year)
if consumption is None:
self.consumption = Consumption(start_year=policy.start_year)
elif isinstance(consumption, Consumption):
Expand All @@ -120,10 +122,10 @@ def __init__(self, policy=None, records=None, verbose=True,
self.behavior.set_year(next_year)
else:
raise ValueError('behavior must be None or Behavior object')
if sync_years and self.records.current_year == Records.PUF_YEAR:
if sync_years and self.records.current_year == self.records.data_year:
if verbose:
print('You loaded data for ' +
str(self.records.current_year) + '.')
str(self.records.data_year) + '.')
if len(self.records.IGNORED_VARS) > 0:
print('Your data include the following unused ' +
'variables that will be ignored:')
Expand All @@ -143,6 +145,7 @@ def calc_all(self, zero_out_calc_vars=False):
Call all tax-calculation functions.
"""
# conducts static analysis of Calculator object for current_year
assert self.records.current_year == self.policy.current_year
self._calc_one_year(zero_out_calc_vars)
BenefitSurtax(self)
BenefitLimitation(self)
Expand Down
4 changes: 3 additions & 1 deletion taxcalc/cli/tc.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ def cli_tc_main():
parser.add_argument('INPUT', nargs='?',
help=('INPUT is name of CSV-formatted file that '
'contains for each filing unit variables used '
'to compute taxes for TAXYEAR.'),
'to compute taxes for TAXYEAR. Specifying '
'"cps.csv" uses CPS input files included in '
'the taxcalc package.'),
default='')
parser.add_argument('TAXYEAR', nargs='?',
help=('TAXYEAR is calendar year for which taxes '
Expand Down
Binary file added taxcalc/cps.csv.gz
Binary file not shown.
Binary file added taxcalc/cps_weights.csv.gz
Binary file not shown.
7 changes: 5 additions & 2 deletions taxcalc/growfactors.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,12 @@ def __init__(self, growfactors_filename=FILE_PATH):
# pylint: disable=redefined-variable-type
# (above because pylint mistakenly thinks gfdf is not a DataFrame)
if os.path.isfile(growfactors_filename):
gfdf = pd.read_csv(growfactors_filename, index_col='YEAR')
gfdf = pd.read_csv(growfactors_filename,
index_col='YEAR')
else:
gfdf = read_egg_csv(Growfactors.FILENAME, index_col='YEAR')
# cannot call read_egg_ function in unit tests
gfdf = read_egg_csv(Growfactors.FILENAME,
index_col='YEAR') # pragma: no cover
else:
raise ValueError('growfactors_filename is not a string')
assert isinstance(gfdf, pd.DataFrame)
Expand Down
4 changes: 3 additions & 1 deletion taxcalc/parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,9 @@ def _params_dict_from_json_file(cls):
params_dict = json.load(pfile,
object_pairs_hook=collect.OrderedDict)
else:
params_dict = read_egg_json(cls.DEFAULTS_FILENAME)
# cannot call read_egg_ function in unit tests
params_dict = read_egg_json(
cls.DEFAULTS_FILENAME) # pragma: no cover
return params_dict

def _update(self, year_mods):
Expand Down
114 changes: 78 additions & 36 deletions taxcalc/records.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@


PUFCSV_YEAR = 2009
CPSCSV_YEAR = 2014


class Records(object):
Expand All @@ -39,20 +40,20 @@ class Records(object):
gfactors: Growfactors class instance or None
containing record data extrapolation (or "blowup") factors

adjust_ratios: string or Pandas DataFrame or None
string describes CSV file in which adjustment ratios reside;
DataFrame already contains adjustment ratios;
None creates empty adjustment-ratios DataFrame;
default value is filename of the default adjustment ratios.

weights: string or Pandas DataFrame or None
string describes CSV file in which weights reside;
DataFrame already contains weights;
None creates empty sample-weights DataFrame;
default value is filename of the default weights.
default value is filename of the PUF weights.

adjust_ratios: string or Pandas DataFrame or None
string describes CSV file in which adjustment ratios reside;
DataFrame already contains adjustment ratios;
None creates empty adjustment-ratios DataFrame;
default value is filename of the PUF adjustment ratios.

start_year: integer
specifies calendar year of the data;
specifies calendar year of the input data;
default value is PUFCSV_YEAR.
Note that if specifying your own data (see above) as being a custom
data set, be sure to explicitly set start_year to the
Expand All @@ -78,7 +79,7 @@ class instance: Records

Notes
-----
Typical usage is as follows::
Typical usage when using PUF input data is as follows::

recs = Records()

Expand All @@ -88,6 +89,9 @@ class instance: Records
situations in which you need to specify the values of the Record
constructor's arguments, but be sure you know exactly what you are
doing when attempting this.

Use Records.cps_constructor() to get a Records object instantiated
with CPS input data.
"""
# suppress pylint warnings about unrecognized Records variables:
# pylint: disable=no-member
Expand All @@ -96,23 +100,22 @@ class instance: Records
# suppress pylint warnings about too many class instance attributes:
# pylint: disable=too-many-instance-attributes

PUF_YEAR = PUFCSV_YEAR
CUR_PATH = os.path.abspath(os.path.dirname(__file__))
WEIGHTS_FILENAME = 'puf_weights.csv'
WEIGHTS_PATH = os.path.join(CUR_PATH, WEIGHTS_FILENAME)
ADJUST_RATIOS_FILENAME = 'puf_ratios.csv'
ADJUST_RATIOS_PATH = os.path.join(CUR_PATH, ADJUST_RATIOS_FILENAME)
PUF_WEIGHTS_FILENAME = 'puf_weights.csv'
PUF_RATIOS_FILENAME = 'puf_ratios.csv'
CPS_WEIGHTS_FILENAME = 'cps_weights.csv.gz'
CPS_RATIOS_FILENAME = None
VAR_INFO_FILENAME = 'records_variables.json'
VAR_INFO_PATH = os.path.join(CUR_PATH, VAR_INFO_FILENAME)

def __init__(self,
data='puf.csv',
exact_calculations=False,
gfactors=Growfactors(),
weights=WEIGHTS_PATH,
adjust_ratios=ADJUST_RATIOS_PATH,
weights=PUF_WEIGHTS_FILENAME,
adjust_ratios=PUF_RATIOS_FILENAME,
start_year=PUFCSV_YEAR):
# pylint: disable=too-many-arguments
self._data_year = start_year
# read specified data
self._read_data(data, exact_calculations)
# check that three sets of split-earnings variables have valid values
Expand Down Expand Up @@ -143,7 +146,7 @@ def __init__(self,
self.WT = None
self._read_weights(weights)
self.ADJ = None
self._read_adjust(adjust_ratios)
self._read_ratios(adjust_ratios)
# weights must be same size as tax record data
if not self.WT.empty and self.dim != len(self.WT):
# scale-up sub-sample weights by year-specific factor
Expand All @@ -160,13 +163,40 @@ def __init__(self,
msg = 'start_year is not an integer'
raise ValueError(msg)
# consider applying initial-year grow factors
if gfactors is not None and start_year == Records.PUF_YEAR:
if gfactors is not None and start_year == self._data_year:
self._blowup(start_year)
# construct sample weights for current_year
wt_colname = 'WT{}'.format(self.current_year)
if wt_colname in self.WT.columns:
self.s006 = self.WT[wt_colname] * 0.01

@staticmethod
def cps_constructor(exact_calculations=False,
                    growfactors=Growfactors()):
    """
    Static method that returns a Records object instantiated with CPS
    input data, in a way analogous to how Records() returns a Records
    object instantiated with PUF input data.

    This convenience method supplies the CPS-specific data file,
    weights, adjustment ratios and start year, so that a caller needs
    to specify only exact_calculations and growfactors, both of which
    are passed through to the Records class constructor.
    """
    # the CPS data file resides in the directory named by Records.CUR_PATH
    cps_data_path = os.path.join(Records.CUR_PATH, 'cps.csv.gz')
    cps_kwargs = {
        'data': cps_data_path,
        'exact_calculations': exact_calculations,
        'gfactors': growfactors,
        'weights': Records.CPS_WEIGHTS_FILENAME,
        'adjust_ratios': Records.CPS_RATIOS_FILENAME,
        'start_year': CPSCSV_YEAR,
    }
    return Records(**cps_kwargs)

@property
def data_year(self):
    """
    Records class property that returns the value held in the
    _data_year attribute (the original data year).
    """
    original_year = self._data_year
    return original_year

@property
def current_year(self):
"""
Expand Down Expand Up @@ -206,11 +236,15 @@ def read_var_info():
Read Records variables metadata from JSON file;
returns dictionary and specifies static varname sets listed below.
"""
if os.path.exists(Records.VAR_INFO_PATH):
with open(Records.VAR_INFO_PATH) as vfile:
var_info_path = os.path.join(Records.CUR_PATH,
Records.VAR_INFO_FILENAME)
if os.path.exists(var_info_path):
with open(var_info_path) as vfile:
vardict = json.load(vfile)
else:
vardict = read_egg_json(Records.VAR_INFO_FILENAME)
# cannot call read_egg_ function in unit tests
vardict = read_egg_json(
Records.VAR_INFO_FILENAME) # pragma: no cover
Records.INTEGER_READ_VARS = set(k for k, v in vardict['read'].items()
if v['type'] == 'int')
FLOAT_READ_VARS = set(k for k, v in vardict['read'].items()
Expand Down Expand Up @@ -362,10 +396,11 @@ def _read_data(self, data, exact_calcs):
if isinstance(data, pd.DataFrame):
taxdf = data
elif isinstance(data, six.string_types):
if data.endswith('gz'):
taxdf = pd.read_csv(data, compression='gzip')
else:
if os.path.isfile(data):
taxdf = pd.read_csv(data)
else:
# cannot call read_egg_ function in unit tests
taxdf = read_egg_csv(data) # pragma: no cover
else:
msg = 'data is neither a string nor a Pandas DataFrame'
raise ValueError(msg)
Expand Down Expand Up @@ -430,36 +465,43 @@ def _read_weights(self, weights):
if isinstance(weights, pd.DataFrame):
WT = weights
elif isinstance(weights, six.string_types):
if os.path.isfile(weights):
weights_path = os.path.join(Records.CUR_PATH, weights)
if os.path.isfile(weights_path):
# pylint: disable=redefined-variable-type
# (above because pylint mistakenly thinks WT not a DataFrame)
WT = pd.read_csv(weights)
WT = pd.read_csv(weights_path)
else:
WT = read_egg_csv(Records.WEIGHTS_FILENAME)
# cannot call read_egg_ function in unit tests
WT = read_egg_csv(
os.path.basename(weights_path)) # pragma: no cover
else:
msg = 'weights is not None or a string or a Pandas DataFrame'
raise ValueError(msg)
assert isinstance(WT, pd.DataFrame)
setattr(self, 'WT', WT)

def _read_adjust(self, adjust_ratios):
def _read_ratios(self, ratios):
"""
Read Records adjustment ratios from file or uses specified DataFrame
as data or creates empty DataFrame if None
"""
if adjust_ratios is None:
if ratios is None:
ADJ = pd.DataFrame({'nothing': []})
setattr(self, 'ADJ', ADJ)
return
if isinstance(adjust_ratios, pd.DataFrame):
ADJ = adjust_ratios
elif isinstance(adjust_ratios, six.string_types):
if os.path.isfile(adjust_ratios):
if isinstance(ratios, pd.DataFrame):
ADJ = ratios
elif isinstance(ratios, six.string_types):
ratios_path = os.path.join(Records.CUR_PATH, ratios)
if os.path.isfile(ratios_path):
# pylint: disable=redefined-variable-type
# (above because pylint mistakenly thinks ADJ not a DataFrame)
ADJ = pd.read_csv(adjust_ratios, index_col=0)
ADJ = pd.read_csv(ratios_path,
index_col=0)
else:
ADJ = read_egg_csv(Records.ADJUST_RATIOS_FILENAME, index_col=0)
# cannot call read_egg_ function in unit tests
ADJ = read_egg_csv(os.path.basename(ratios_path),
index_col=0) # pragma: no cover
ADJ = ADJ.transpose()
else:
msg = ('adjust_ratios is not None or a string'
Expand Down
Loading