-
Notifications
You must be signed in to change notification settings - Fork 0
/
recode.py
executable file
·126 lines (113 loc) · 5.73 KB
/
recode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#As a work of the United States government, this project is in the public
#domain within the United States. Additionally, we waive copyright and related
#rights in the work worldwide through the CC0 1.0 Universal public domain
#dedication (https://creativecommons.org/publicdomain/zero/1.0/)
''' Convert Recon/Reid format CEF data back into format needed for suppression metrics '''
import os
import re
import pandas as pd
from itertools import product
from operator import itemgetter
from multiprocessing import Pool
#DTYPES for rhdf file
DTYPES = {'geoid_block': 'str',
'age': 'int',
'white': 'str',
'black': 'str',
'aian': 'str',
'asian': 'str',
'nhopi': 'str',
'sor': 'str',
'hisp': 'str'}
#Dictionary of race code sets
RACE_DICT = {
'WHITE': {'01', '07', '08', '09', '10', '11', '22', '23', '24', '25', '26',
'27', '28', '29', '30', '31', '42', '43', '44', '45', '46', '47', '48',
'49', '50', '51', '57', '58', '59', '60', '61', '63'},
'BLACK': {'02', '07', '12', '13', '14', '15', '22', '23', '24', '25', '32',
'33', '34', '35', '36', '37', '42', '43', '44', '45', '46', '47', '52',
'53', '54', '55', '57', '58', '59', '60', '62', '63'},
'AIAN': {'03', '08', '12', '16', '17', '18', '22', '26', '27', '28', '32', '33',
'34', '38', '39', '40', '42', '43', '44', '48', '49', '50', '52', '53',
'54', '56', '57', '58', '59', '61', '62', '63'},
'ASIAN': {'04', '09', '13', '16', '19', '20', '23', '26', '29', '30', '32',
'35', '36', '38', '39', '41', '42', '45', '46', '48', '49', '51', '52',
'53', '55', '56', '57', '58', '60', '61', '62', '63'},
'NHOPI': {'05', '10', '14', '17', '19', '21', '24', '27', '29', '31', '33',
'35', '37', '38', '40', '41', '43', '45', '47', '48', '50', '51', '52',
'54', '55', '56', '57', '59', '60', '61', '62', '63'},
'SOR': {'06', '11', '15', '18', '20', '21', '25', '28', '30', '31', '34', '36',
'37', '39', '40', '41', '44', '46', '47', '49', '50', '51', '53', '54',
'55', '56', '58', '59', '60', '61', '62', '63'}
}
RACE_ORDER = ['WHITE', 'BLACK', 'AIAN', 'ASIAN', 'NHOPI', 'SOR']
def make_cenrace(race_dict, race_order):
''' function to define map of rhdf race categories back into cenrace values
e.g. YNNNNN -> 01 #white only
returns: a dictionary of form {'XXXXXX': 'YY', ...} giving the mapping
'''
ret = {}
possibles = [''.join(x) for x in list(product('YN', repeat=6))]
for possible in possibles:
#location of yesses
Ylocs = [pos for pos, char in enumerate(possible) if char=='Y']
#location of noes
Nlocs = [pos for pos, char in enumerate(possible) if char=='N']
Yraces = [race_order[x] for x in Ylocs]
Nraces = [race_order[x] for x in Nlocs]
Yintersection = set()
Nunion = set()
if(len(Yraces) > 0):
Ysets = itemgetter(*Yraces)(race_dict)
Yintersection = Ysets
if(len(Yraces) > 1):
Yintersection = set.intersection(*Ysets)
if(len(Nraces) > 0):
Nsets = itemgetter(*Nraces)(race_dict)
Nunion = Nsets
if(len(Nraces) > 1):
Nunion = set.union(*Nsets)
#Category we need must satisfy all the Y values and none of the N values
rcat = list(Yintersection - Nunion)
if(len(rcat) > 0):
ret[possible] = int(rcat[0])
return(ret)
RACE_MAP = make_cenrace(RACE_DICT, RACE_ORDER)
RCAT_MAP = {1: 1, 2: 2, 7: 2, 3: 3, 8: 3, 4: 4, 9: 4, 5: 5, 10: 5, 6: 6, 11: 6}
RCAT_MAP.update(dict.fromkeys(list(range(12, 64)), 7))
AGE18_MAP = {x: int(x >= 18) for x in range(0, 116)}
def process_file(rhdf_path, outname, isCEF=False):
print(f'Reading {rhdf_path}')
rHDF = pd.read_csv(rhdf_path, dtype=DTYPES, usecols=DTYPES.keys())
#column manipulation for r&r cef file
if(isCEF):
rHDF.white.replace({'0': 'N', '1': 'Y'}, inplace=True)
rHDF.black.replace({'0': 'N', '1': 'Y'}, inplace=True)
rHDF.aian.replace({'0': 'N', '1': 'Y'}, inplace=True)
rHDF.asian.replace({'0': 'N', '1': 'Y'}, inplace=True)
rHDF.nhopi.replace({'0': 'N', '1': 'Y'}, inplace=True)
rHDF.sor.replace({'0': 'N', '1': 'Y'}, inplace=True)
rHDF.hisp.replace({'0': 'N', '1': 'Y'}, inplace=True)
rHDF['allrace'] = rHDF['white'] + rHDF['black'] + rHDF['aian'] + rHDF['asian'] + rHDF['nhopi'] + rHDF['sor']
rHDF['CENRACE'] = rHDF.allrace.replace(RACE_MAP)
rHDF['race'] = rHDF.CENRACE.replace(RCAT_MAP)
rHDF['state'] = rHDF.geoid_block.str[:2]
rHDF['county'] = rHDF.geoid_block.str[2:5]
rHDF['tract'] = rHDF.geoid_block.str[5:11]
rHDF['blkgrp'] = rHDF.geoid_block.str[11:12]
rHDF['block'] = rHDF.geoid_block.str[11:]
rHDF['age18'] = rHDF.age.replace(AGE18_MAP)
rHDF['his'] = rHDF.hisp.replace({'N': '0', 'Y': '1'})
rHDF['nation'] = 1
OUTCOLS = ['race', 'nation', 'state', 'county', 'tract', 'blkgrp', 'block', 'age18', 'his']
if outname is not None: rHDF[OUTCOLS].to_csv(outname, index=False)
return(rHDF[OUTCOLS])
def mapper(x):
return(process_file(x, None, True))
if __name__ == '__main__':
cefdir = '../data/reid_module/cef/'
cef_files = [cefdir + f for f in os.listdir(cefdir) if re.search(r'cef\d{5}[.]csv', f) if not 'cef72' in f]
p = Pool(25)
ceflist = p.map(mapper, cef_files)
print('Concatenating and writing full CEF file')
pd.concat(ceflist).to_csv('pcef.csv', index=False)