diff --git a/cps_data/adjustment_targets.csv b/cps_data/adjustment_targets.csv new file mode 100644 index 00000000..5d1807dd --- /dev/null +++ b/cps_data/adjustment_targets.csv @@ -0,0 +1,20 @@ +INT,ODIV,QDIV,BIZ +4688264,4713140,3095521,-8287987 +541622,999149,639115,2836970 +770262,1442485,859292,11990777 +1002678,1876844,1137469,22132182 +1147555,1798693,1137002,14556107 +1125760,1957918,1234331,10372683 +1340811,1960817,1183817,9243160 +2096189,4116883,2794332,15956819 +1900496,4673360,2996252,13332856 +5955852,14387800,9906058,28063051 +5013047,14339274,9921453,25009982 +13390182,43136765,31985381,66676016 +12725861,44531293,34400475,60723838 +6814230,24570303,19243443,22953218 +3825975,10497137,8124591,6770746 +2582189,7490783,5750856,3743423 +7098608,19187445,14940470,6606643 +4547632,11314258,8846877,2419024 +17327068,41707885,34250973,2159257 \ No newline at end of file diff --git a/cps_data/cps.csv.gz b/cps_data/cps.csv.gz new file mode 100644 index 00000000..1788976a Binary files /dev/null and b/cps_data/cps.csv.gz differ diff --git a/cps_data/cps_raw.csv.gz b/cps_data/cps_raw.csv.gz new file mode 100644 index 00000000..f7438148 Binary files /dev/null and b/cps_data/cps_raw.csv.gz differ diff --git a/cps_data/finalprep.py b/cps_data/finalprep.py index c09cca39..8ba1b930 100644 --- a/cps_data/finalprep.py +++ b/cps_data/finalprep.py @@ -1,158 +1,375 @@ import pandas as pd import numpy as np +import sys +import copy +import subprocess -# Import production file -data = pd.read_csv('prod2015_v2e.csv') - -# Rename variables where possible -renames = { - 'IFDEPT': 'DSI', - 'TAXYEAR': 'FLPDYR', - 'XXTOT': 'XTOT', - 'JCPS21': 'e00200p', - 'JCPS31': 'e00200s', - 'ALIMONY': 'e00800', - 'JCPS25': 'e00900p', - 'JCPS35': 'e00900s', - 'JCPS28': 'e02100p', - 'JCPS38': 'e02100s', - 'UCOMP': 'e02300', - 'SOCSEC': 'e02400', - 'SEHEALTH': 'e03270', - 'DPAD': 'e03240', - 'MEDICALEXP': 'e17500', - 'REALEST': 'e18500', - 'MISCITEM': 'e20400', - 'CCE': 'e32800', - 'ICPS01': 
'age_head', - 'ICPS02': 'age_spouse', - 'WT': 's006', - 'FILST': 'filer', - 'SEQUENCE': 'RECID', - 'PENSIONS': 'e01700', - 'DBE': 'e00600', - 'KEOGH': 'e03300', - 'TIRAD': 'e01400' -} - -data = data.rename(columns=renames) - -# Adjust MARS to address lack of married filing separately status -# 1 = Single filers -# 2 = Married filing jointly -# 4 = Head of household -data['MARS'] = np.where(data.JS == 3, 4, data.JS) - -# Use primary taxpayer and spouse records to get total tax unit earnings -data['e00200'] = data.e00200p + data.e00200s -data['e00900'] = data.e00900p + data.e00900s -data['e02100'] = data.e02100p + data.e02100s - -# Impute variables where possible - -# Determine amount of qualified dividends using IRS ratio -data['e00650'] = data.e00600 * 0.7556 - -# Split interest income into taxable and tax exempt using IRS ratio -taxable = 0.6 -nontaxable = 1. - taxable -data['e00300'] = data.INTST * taxable -data['e00400'] = data.INTST * nontaxable - -# Apply charitable deduction limit -halfAGI = (data.JCPS9 + data.JCPS19) * 0.5 -charity = np.where(data.CHARITABLE > halfAGI, - halfAGI, data.CHARITABLE) -# Split charitable giving into cash and non-cash using ratio in PUF -cash = 0.82013 -non_cash = 1. 
- cash -data['e19800'] = charity * cash -data['e20100'] = charity * non_cash - -# Apply student loan interest deduction limit -data['e03210'] = np.where(data.SLINT > 2500, 2500, data.SLINT) - -# Apply IRA contribution limits -deductibleIRA = np.where(data.AGE >= 50, - np.where(data.ADJIRA > 6500, 6500, data.ADJIRA), - np.where(data.ADJIRA > 5500, 5500, data.ADJIRA)) -data['e03150'] = deductibleIRA - -# Count number of dependents under 13 -# Max of four to match PUF version of nu13 -age1 = np.where((data.ICPS03 > 0) & (data.ICPS03 <= 13), 1, 0) -age2 = np.where((data.ICPS04 > 0) & (data.ICPS04 <= 13), 1, 0) -age3 = np.where((data.ICPS05 > 0) & (data.ICPS05 <= 13), 1, 0) -age4 = np.where((data.ICPS06 > 0) & (data.ICPS06 <= 13), 1, 0) -nu13 = age1 + age2 + age3 + age4 -data['nu13'] = nu13 - -# Count number of dependents under 5 -age1 = np.where((data.ICPS03 > 0) & (data.ICPS03 <= 5), 1, 0) -age2 = np.where((data.ICPS04 > 0) & (data.ICPS04 <= 5), 1, 0) -age3 = np.where((data.ICPS05 > 0) & (data.ICPS05 <= 5), 1, 0) -age4 = np.where((data.ICPS06 > 0) & (data.ICPS06 <= 5), 1, 0) -age5 = np.where((data.ICPS07 > 0) & (data.ICPS06 <= 5), 1, 0) -nu05 = age1 + age2 + age3 + age4 + age5 -data['nu05'] = nu05 - -# Count number of children eligible for child tax credit -# Max of three to mach PUF version of n24 -age1 = np.where((data.ICPS03 > 0) & (data.ICPS03 <= 17), 1, 0) -age2 = np.where((data.ICPS04 > 0) & (data.ICPS04 <= 17), 1, 0) -age3 = np.where((data.ICPS05 > 0) & (data.ICPS05 <= 17), 1, 0) -n24 = age1 + age2 + age3 -data['n24'] = n24 - -# Count number of elderly dependents -age1 = np.where(data.ICPS03 >= 65, 1, 0) -age2 = np.where(data.ICPS04 >= 65, 1, 0) -age3 = np.where(data.ICPS05 >= 65, 1, 0) -age4 = np.where(data.ICPS06 >= 65, 1, 0) -age5 = np.where(data.ICPS07 >= 65, 1, 0) -elderly = age1 + age2 + age3 + age4 + age5 -data['elderly_dependent'] = elderly - -# List of usable variables in TaxCalc -USABLE_READ_VARS = [ - 'DSI', 'EIC', 'FLPDYR', - 'f2441', 'f6251', 
# Lower bounds of AGI bins 1-18; bin 0 holds negative values.  Shared by
# add_agi_bin and adjust_helper so both always use the same thresholds.
_AGI_BIN_EDGES = np.array([
    0, 5000, 10000, 15000, 20000, 25000, 30000, 40000, 50000, 75000,
    100000, 200000, 500000, 1e6, 1.5e6, 2e6, 5e6, 1e7])
_NUM_AGI_BINS = len(_AGI_BIN_EDGES) + 1

# Columns holding the ages of the (up to five) dependents in a tax unit.
_DEPENDENT_AGE_COLS = ['ICPS03', 'ICPS04', 'ICPS05', 'ICPS06', 'ICPS07']


def main():
    """
    Build cps.csv.gz from cps_raw.csv.gz and adjustment_targets.csv.

    Reads both input files from the current directory, renames the raw
    columns, derives the remaining variables, adjusts the distribution of
    selected income items, drops unused columns and writes the gzipped
    result.
    """
    # Import CPS data file
    data = pd.read_csv('cps_raw.csv.gz', compression='gzip')
    adj_targets = pd.read_csv('adjustment_targets.csv')
    # other_ben = pd.read_csv('benefitprograms.csv')

    # Rename specified variables
    renames = {
        'IFDEPT': 'DSI',
        'TAXYEAR': 'FLPDYR',
        'XXTOT': 'XTOT',
        'JCPS21': 'e00200p',
        'JCPS31': 'e00200s',
        'ALIMONY': 'e00800',
        'JCPS25': 'e00900p',
        'JCPS35': 'e00900s',
        'JCPS28': 'e02100p',
        'JCPS38': 'e02100s',
        'UCOMP': 'e02300',
        'SOCSEC': 'e02400',
        'SEHEALTH': 'e03270',
        'DPAD': 'e03240',
        'MEDICALEXP': 'e17500',
        'REALEST': 'e18500',
        'MISCITEM': 'e20400',
        'CCE': 'e32800',
        'ICPS01': 'age_head',
        'ICPS02': 'age_spouse',
        'WT': 's006',
        'FILST': 'filer',
        'SEQUENCE': 'RECID',
        'PENSIONS': 'e01500',
        'DBE': 'e00600',
        'KEOGH': 'e03300',
        'TIRAD': 'e01400',
        'NU18': 'nu18',
        'N1821': 'n1821',
        'N21': 'n21',
        'CGAGIX': 'e01100',
        'BLIND_HEAD': 'blind_head',
        'BLIND_SPOUSE': 'blind_spouse',
        'HMIE': 'e19200',
        # 'SSI': 'ssi_ben',
        # 'VB': 'vet_ben',
        # 'MEDICARE': 'mcare_ben',
        # 'MEDICAID': 'mcaid_ben',
        # 'SS': 'ss_ben',
        # 'SNAP': 'snap_ben',
        'SLTX': 'e18400'
    }
    data = data.rename(columns=renames)
    # There is no married-filing-separately status in the data:
    # 1 = single, 2 = married filing jointly, JS code 3 becomes 4 (head of
    # household).
    data['MARS'] = np.where(data.JS == 3, 4, data.JS)

    # Use taxpayer and spouse records to get total tax unit earnings and AGI
    data['e00100'] = data['JCPS9'] + data['JCPS19']
    data['e00200'] = data['e00200p'] + data['e00200s']
    data['e00900'] = data['e00900p'] + data['e00900s']
    data['e02100'] = data['e02100p'] + data['e02100s']
    # Determine amount of qualified dividends using IRS ratio
    data['e00650'] = data.e00600 * 0.7556

    # Split interest income into taxable and tax exempt using IRS ratio
    taxable = 0.6
    nontaxable = 1. - taxable
    data['e00300'] = data.INTST * taxable
    data['e00400'] = data.INTST * nontaxable

    # Split pensions and annuities using PUF ratio
    data['e01700'] = data['e01500'] * 0.1656

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Applying deduction limits')
    data = deduction_limits(data)
    print('Adding dependents')
    data = add_dependents(data)
    print('Adding AGI bins')
    data = add_agi_bin(data, 'INCOME')
    print('Adjusting distribution')
    data = adjust(data, adj_targets)
    # print('Adding Benefits Data')
    # data = benefits(data, other_ben)
    print('Dropping unused variables')
    data = drop_vars(data)

    data = data.fillna(0.)
    print('Exporting...')
    data.to_csv('cps.csv', index=False)
    # -n keeps the name/timestamp out of the archive so the output is
    # byte-stable; -f overwrites a cps.csv.gz left by an earlier run
    # instead of making gzip fail.
    subprocess.check_call(["gzip", "-n", "-f", "cps.csv"])


def _cap(values, limit):
    """Return values with every entry above limit replaced by limit."""
    return np.where(values > limit, limit, values)


def deduction_limits(data):
    """
    Apply limits on itemized deductions and adjustments.

    Adds e19800/e20100 (cash/non-cash charitable giving, capped at half of
    e00100), e03210 (student loan interest, capped at 2500) and e03150
    (IRA deduction, capped at 6500 when AGE >= 50, otherwise 5500).

    Parameters
    ----------
    data: DataFrame with e00100, CHARITABLE, SLINT, AGE and ADJIRA columns

    Returns
    -------
    The same DataFrame with the new columns added
    """
    half_agi = data['e00100'] * 0.5
    charity = _cap(data.CHARITABLE, half_agi)
    # Split charitable contributions into cash and non-cash using ratio in PUF
    cash = 0.82013
    non_cash = 1. - cash
    data['e19800'] = charity * cash
    data['e20100'] = charity * non_cash

    # Apply student loan interest deduction limit
    data['e03210'] = _cap(data.SLINT, 2500)

    # Apply IRA contribution limit, which depends on age
    ira_limit = np.where(data.AGE >= 50, 6500, 5500)
    data['e03150'] = _cap(data.ADJIRA, ira_limit)

    return data


def _count_dependents(data, columns, in_range):
    """
    Count, per record, the age columns whose value satisfies in_range.

    in_range receives the DataFrame of ages and must return a boolean
    DataFrame of the same shape.
    """
    return in_range(data[columns]).sum(axis=1)


def add_dependents(data):
    """
    Add dependent-count variables derived from the dependent age columns.

    An age of 0 is never counted by the child variables (every test
    requires age > 0).

    Parameters
    ----------
    data: DataFrame with ICPS03 ... ICPS07 dependent age columns

    Returns
    -------
    The same DataFrame with nu13, nu05, n24, elderly_dependent, f2441 and
    EIC added
    """
    all_deps = _DEPENDENT_AGE_COLS

    # Count number of dependents under 13
    # Max of four to match PUF version of nu13, so ICPS07 is left out
    # NOTE(review): this uses age <= 13 while f2441 below uses age < 13;
    # confirm which bound is intended.
    data['nu13'] = _count_dependents(
        data, all_deps[:4], lambda age: (age > 0) & (age <= 13))

    # Count number of dependents under 5
    # (the fifth dependent is now tested against its own age, ICPS07)
    data['nu05'] = _count_dependents(
        data, all_deps, lambda age: (age > 0) & (age <= 5))

    # Count number of children eligible for child tax credit
    # Max of three to match PUF version of n24
    n24 = _count_dependents(
        data, all_deps, lambda age: (age > 0) & (age <= 17))
    data['n24'] = np.minimum(n24, 3)

    # Count number of elderly dependents
    data['elderly_dependent'] = _count_dependents(
        data, all_deps, lambda age: age >= 65)

    # Count number eligible for f2441 (max of three)
    f2441 = _count_dependents(
        data, all_deps, lambda age: (age > 0) & (age < 13))
    data['f2441'] = np.minimum(f2441, 3)

    # Count number eligible for EIC (max of three)
    eic = _count_dependents(
        data, all_deps, lambda age: (age > 0) & (age < 19))
    data['EIC'] = np.minimum(eic, 3)

    return data


def drop_vars(data):
    """
    Return a DataFrame without the columns that are not in the list of
    useable variables.
    """
    useable_vars = set([
        'DSI', 'EIC', 'FLPDYR', 'MARS', 'MIDR', 'RECID', 'XTOT', 'age_head',
        'age_spouse', 'agi_bin', 'blind_head', 'blind_spouse', 'cmbtp',
        'e00200', 'e00200p', 'e00200s', 'e00300', 'e00400', 'e00600', 'e00650',
        'e00700', 'e00800', 'e00900', 'e00900p', 'e00900s', 'e01100', 'e01200',
        'e01400', 'e01500', 'e01700', 'e02000', 'e02100', 'e02100p', 'e02100s',
        'e02300', 'e02400', 'e03150', 'e03220', 'e03230', 'e03240', 'e03270',
        'e03290', 'e03300', 'e03400', 'e03500', 'e07240', 'e07260', 'e07300',
        'e07400', 'e07600', 'e09700', 'e09800', 'e09900', 'e11200', 'e17500',
        'e18400', 'e18500', 'e19200', 'e19800', 'e20100', 'e20400', 'g20500',
        'e24515', 'e24518', 'e26270', 'e27200', 'e32800', 'e58990', 'e62900',
        'e87530', 'elderly_dependent', 'f2441', 'f6251', 'filer', 'n24',
        'nu05', 'nu13', 'nu18', 'n1821', 'n21', 'p08000', 'p22250', 'p23250',
        'p25470', 'p87521', 's006', 'e03210', 'ssi_ben', 'snap_ben',
        'vet_ben', 'mcare_ben', 'mcaid_ben', 'ss_ben', 'other_ben', 'total_ben'
    ])
    # for i in range(1, 16):
    #     useable_vars.add('SSI_VAL{}'.format(str(i)))
    #     useable_vars.add('SSI_PROB{}'.format(str(i)))
    unused = [col for col in data.columns if col not in useable_vars]
    return data.drop(unused, axis=1)


def _agi_bin_index(values):
    """
    Return the AGI bin number (0-18) of each value as an integer array.

    Bin 0 is for negative values, bin k (k >= 1) starts at
    _AGI_BIN_EDGES[k - 1].  Missing values are placed in bin 0.
    """
    values = np.asarray(values, dtype=float)
    bins = np.digitize(values, _AGI_BIN_EDGES)
    # digitize sorts NaN past the last edge; keep missing values in bin 0
    bins[np.isnan(values)] = 0
    return bins


def add_agi_bin(data, col_name):
    """
    Add an AGI bin indicator used in Tax-Calc to apply adjustment factors

    Parameters
    ----------
    data: DataFrame holding the income column
    col_name: name of the column the bins are based on

    Returns
    -------
    The same DataFrame with an integer agi_bin column (0-18) added
    """
    # Assigning a plain array is positional, so the result does not depend
    # on the DataFrame's index labels.
    data['agi_bin'] = _agi_bin_index(data[col_name])
    return data


def adjust_helper(agi, var, target, weight, agi_bin):
    """
    Rescale a variable so its weighted total per AGI bin follows the target
    distribution while the overall weighted total stays the same.

    Parameters
    ----------
    agi: AGI provided in the CPS, used to find the current total per bin
    var: variable being adjusted
    target: target bin levels (one value per AGI bin, 19 in total)
    weight: weights
    agi_bin: AGI bin number (0-18) of each record, used to pick the ratio
        applied to that record

    Returns
    -------
    Series containing the adjusted values of the variable; var itself is
    left unmodified

    Raises
    ------
    ValueError: if target does not hold one value per AGI bin
    """
    target_amts = np.asarray(target, dtype=float)
    if len(target_amts) != _NUM_AGI_BINS:
        raise ValueError('target must contain {} values, got {}'.format(
            _NUM_AGI_BINS, len(target_amts)))

    weighted = var * weight
    # Goal total ensures the weighted sum of the variable wont change
    goal_total = weighted.sum()

    # Find current totals in each bin; records without an AGI belong to no
    # bin and missing amounts count as zero.
    agi_values = np.asarray(agi, dtype=float)
    amounts = np.nan_to_num(np.asarray(weighted, dtype=float))
    known = ~np.isnan(agi_values)
    actual_amts = np.bincount(_agi_bin_index(agi_values[known]),
                              weights=amounts[known],
                              minlength=_NUM_AGI_BINS)

    with np.errstate(divide='ignore', invalid='ignore'):
        # Goal distribution based on IRS data, scaled to the goal total
        goal_amts = goal_total * target_amts / target_amts.sum()
        # Determine the ratios
        ratios = goal_amts / actual_amts
    # A bin whose current total is zero cannot be rescaled: leave its
    # records unchanged instead of spreading inf/NaN into the data.
    ratios = np.where(np.isfinite(ratios), ratios, 1.)

    # Apply adjustment ratios
    return var * ratios[np.asarray(agi_bin, dtype=int)]


def _safe_share(part, whole):
    """Return part / whole, with 0 wherever the result is not finite."""
    share = part / whole
    return share.where(np.isfinite(share), 0.)


def adjust(data, targets):
    """
    Adjust the distribution of interest, dividend and business income.

    Parameters
    ----------
    data: CPS in DataFrame format; needs INCOME, s006, agi_bin, e00300,
        e00600, e00650, e00900 and e00900p
    targets: targeted totals provided by the IRS, with INT, ODIV, QDIV and
        BIZ columns holding one row per AGI bin

    Returns
    -------
    The same DataFrame with e00300, e00600, e00650, e00900, e00900p and
    e00900s replaced by their adjusted values
    """
    inc = data['INCOME']
    weight = data['s006']
    bins = data['agi_bin']

    data['e00300'] = adjust_helper(inc, data['e00300'], targets['INT'],
                                   weight, bins)

    # Shares are taken before the adjustment; a record with a zero total
    # gets share 0 so it stays at zero instead of turning into NaN.
    div_share = _safe_share(data['e00600'],
                            data['e00600'] + data['e00650'])
    odiv = adjust_helper(inc, data['e00600'], targets['ODIV'], weight, bins)
    qdiv = adjust_helper(inc, data['e00650'], targets['QDIV'], weight, bins)
    total_div = odiv + qdiv
    data['e00600'] = total_div * div_share
    data['e00650'] = total_div * (1. - div_share)

    biz_share_p = _safe_share(data['e00900p'], data['e00900'])
    data['e00900'] = adjust_helper(inc, data['e00900'], targets['BIZ'],
                                   weight, bins)
    # Split the adjusted total between taxpayer and spouse as before
    data['e00900p'] = data['e00900'] * biz_share_p
    data['e00900s'] = data['e00900'] * (1. - biz_share_p)

    return data


def benefits(data, other_ben):
    """
    Distribute benefits from non-models benefit programs and create total
    benefits variable

    Parameters
    ----------
    data: DataFrame with mcaid_ben, mcare_ben, ssi_ben, snap_ben, ss_ben,
        vet_ben and s006 columns
    other_ben: DataFrame with a Cost column

    Returns
    -------
    The same DataFrame with dist_ben, ratio, other_ben and total_ben added
    """
    # Distribute other benefits
    data['dist_ben'] = (data['mcaid_ben'] + data['ssi_ben'] +
                        data['snap_ben'] + data['vet_ben'])
    weighted_ben = data['dist_ben'] * data['s006']
    weighted_total = weighted_ben.sum()
    total_cost = other_ben['Cost'].sum()
    if weighted_total == 0:
        # Nobody receives the benefits the split is based on
        data['ratio'] = 0.
        data['other_ben'] = 0.
    else:
        # Each record's share of the weighted benefits.  Dividing by the
        # weighted total makes the weighted sum of other_ben equal the
        # total cost being distributed.
        data['ratio'] = weighted_ben / weighted_total
        # Same as ratio * total_cost / s006, without dividing by a weight
        # that may be zero
        data['other_ben'] = data['dist_ben'] * (total_cost / weighted_total)
    data['total_ben'] = (data['mcaid_ben'] + data['mcare_ben'] +
                         data['ssi_ben'] + data['snap_ben'] + data['ss_ben'] +
                         data['vet_ben'] + data['other_ben'])
    return data


if __name__ == '__main__':
    sys.exit(main())
"""
Convert cps_weights_raw.csv.gz into the more compact cps_weights.csv.gz.

Every weight is multiplied by 100, rounded and stored as an integer
instead of a floating point number in order to reduce file size.
"""
import subprocess

import pandas as pd


weights = pd.read_csv('cps_weights_raw.csv.gz', compression='gzip')
weights *= 100.
# Round before casting so that values are rounded rather than truncated
weights = weights.round(0).astype('int64')
weights.to_csv('cps_weights.csv', index=False)
# -n keeps the name/timestamp out of the archive so the output is
# byte-stable; -f overwrites a cps_weights.csv.gz left by an earlier run
# instead of making gzip fail.
subprocess.check_call(['gzip', '-n', '-f', 'cps_weights.csv'])