-
Notifications
You must be signed in to change notification settings - Fork 0
/
append_preproc_TSLA_data.py
159 lines (127 loc) · 7.51 KB
/
append_preproc_TSLA_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 27 13:58:14 2019
Append preprocessed TSLA data (HDF file) to existing TSLA dataset.
The data to append should already be filtered.
@author: David Loibl
"""
import pandas as pd
import numpy as np
import os
append_file = "<FILE_TO_APPEND>.h5"
existing_file = "<FILE_TO_APPEND_NEW_DATA_TO>.h5"
output_file = "<OUTPUT_FILE>.h5"
remove_exisitng_output_file = 1
filter_again = False
print('Reading input file. This may take a while ...')
df_TSL_new = pd.read_hdf(append_file, 'TSLs', parse_dates=True, low_memory=False)
print('Success. Imported file contains '+ str(df_TSL_new.shape[0]) +' rows and ' + str(df_TSL_new.shape[1]) +' columns')
# df_TSL_new.LS_DATE = pd.to_datetime(df_TSL_new.LS_DATE, format='%Y-%m-%d')
# df_TSL_new.index = pd.to_datetime(df_TSL_new.LS_DATE, format='%Y-%m-%d')
# Normalize TSLs for individual glaciers to their respective elevation ranges
if 'TSL_stddev_norm' not in df_TSL_new.columns:
df_TSL_new['TSL_stddev_norm'] = (df_TSL_new.SC_stdev - df_TSL_new.glacier_DEM_min) / (df_TSL_new.glacier_DEM_max - df_TSL_new.glacier_DEM_min)
if 'TSL_max_norm' not in df_TSL_new.columns:
df_TSL_new['TSL_max_norm'] = (df_TSL_new.SC_max - df_TSL_new.glacier_DEM_min) / (df_TSL_new.glacier_DEM_max - df_TSL_new.glacier_DEM_min)
if 'TSL_min_norm' not in df_TSL_new.columns:
df_TSL_new['TSL_min_norm'] = (df_TSL_new.SC_min - df_TSL_new.glacier_DEM_min) / (df_TSL_new.glacier_DEM_max - df_TSL_new.glacier_DEM_min)
if 'TSL_normalized' not in df_TSL_new.columns:
df_TSL_new['TSL_normalized'] = (df_TSL_new.SC_median - df_TSL_new.glacier_DEM_min) / (df_TSL_new.glacier_DEM_max - df_TSL_new.glacier_DEM_min)
if filter_again:
# Remove flawed measurements based on intrinsic cirteria
print('Removing flawed measurements based on intrinsic cirteria ...')
thresholds = {
'CC_total': 70,
'CC_TSLrg': 40,
'class_cv': 40,
'max_stdd': 0.2,
'TSL_rnge': 0.005,
'DICa_min': 0.01,
'DICa_gla': 0.02,
'SCmin_of': 50,
'class_cc_cv': 60,
}
#first set class coverage > 100 to 100
df_TSL_new.class_coverage.values[df_TSL_new.class_coverage.values > 100] = 100
OK = np.where((df_TSL_new.CC_total_port < thresholds['CC_total']) &
(df_TSL_new.cc_TSLrange_percent < thresholds['CC_TSLrg']) &
(df_TSL_new.class_coverage > thresholds['class_cv']) &
(df_TSL_new.SC_stdev < (thresholds['max_stdd'] * (df_TSL_new.glacier_DEM_max - df_TSL_new.glacier_DEM_min))) &
((df_TSL_new.SC_max - df_TSL_new.SC_min) > thresholds['TSL_rnge'] * (df_TSL_new.glacier_DEM_max - df_TSL_new.glacier_DEM_min)) &
((df_TSL_new.DIC_area > thresholds['DICa_min']) | (df_TSL_new.SC_min < (df_TSL_new.glacier_DEM_min + thresholds['SCmin_of']))) &
((df_TSL_new.DIC_area > (thresholds['DICa_gla'] * df_TSL_new.glacier_area)) | (df_TSL_new.SC_min < (df_TSL_new.glacier_DEM_min + thresholds['SCmin_of']))) &
((100-df_TSL_new.class_coverage+df_TSL_new.CC_total_port) < thresholds['class_cc_cv'])
)[0]
failed_measurements = len(df_TSL_new) - len(OK)
print('Total number of measurements: ' + str(len(df_TSL_new)))
print('Number of measurements that failed checks: ' + str(failed_measurements))
df_TSL_new = df_TSL_new.iloc[OK,:]
unique_glaciers = df_TSL_new['RGI_ID'].unique()
n_glaciers = len(unique_glaciers)
print('Data set contains '+ str(n_glaciers) +' unique glaciers.')
if os.path.isfile(output_file):
print('')
if remove_exisitng_output_file == 1:
print('Output file exists and will be deleted')
os.remove(output_file)
else:
print('WARNING: Output file exists but should not be deleted.\nData will be appended which may lead to lots of dublettes!')
df_TSL_new.LS_DATE = pd.to_datetime(df_TSL_new.LS_DATE, format='%Y-%m-%d')
exists = os.path.isfile(existing_file)
if not exists:
# Use existing glacier ID file to determine glaciers to process ...
print('No input file found at '+ str(existing_file) +' - skipping')
else:
n_runs = 0
for glacier_ID in unique_glaciers:
#print('Appending data for glacier with RGI-ID '+ str(glacier_ID) +' ...')
#existing_file = glacier_TSL_path +'/'+ glacier_ID +'.h5'
df_TSL_existing_add = df_TSL_new[df_TSL_new.RGI_ID == glacier_ID]
# print('Found '+ str(df_TSL_existing_add.shape[0]) +' new observations for '+ str(glacier_ID) +' ...')
# print('Reading glacier file')
df_TSL_existing = pd.read_hdf(existing_file, where=['RGI_ID == glacier_ID'], parse_dates=True, low_memory=False)
# Remove not matchting columns in first run
if n_runs == 0:
drop_cols = list(set(list(df_TSL_new.columns)).difference(list(df_TSL_existing.columns)))
df_TSL_new.drop(drop_cols, axis=1, inplace=True)
df_result = pd.DataFrame(columns=list(df_TSL_existing.columns))
print('Setting up result DataFrame')
df_TSL_existing.index = pd.to_datetime(df_TSL_existing.LS_DATE, format='%Y-%m-%d')
df_TSL_existing.LS_DATE = pd.to_datetime(df_TSL_existing.LS_DATE, format='%Y-%m-%d')
unique_LS_dates_base = df_TSL_existing.LS_DATE.unique()
unique_LS_dates_add = df_TSL_existing_add.LS_DATE.unique()
non_duplicate_dates = np.invert(np.isin(unique_LS_dates_add, unique_LS_dates_base))
df_TSL_existing_add_non_duplicate = df_TSL_existing_add.loc[non_duplicate_dates,:]
df_TSL_existing.drop(['year', 'month'], axis=1, inplace=True)
df_result = df_TSL_existing.append(df_TSL_existing_add_non_duplicate, sort=True)
# df_update.reset_index(drop=True, inplace=True)
# df_result = df_result.append(df_update, ignore_index=True)
# Ensure Integer cols are detected corretly to avoid export error
int_cols = ['glacier_DEM_min', 'glacier_DEM_max', 'scene']
for int_col in int_cols:
if df_result[int_col].dtype != np.int64:
df_result = df_result.astype({int_col: int})
drop_cols = ['year', 'month', 'TSL_max_norm', 'TSL_min_norm']
for drop_col in drop_cols:
if drop_col in df_result.columns:
df_result.drop(columns=[drop_col], inplace=True)
df_result.reset_index(drop=True, inplace=True)
progress = (n_runs + 1) / n_glaciers * 100
print('Processing '+ str(glacier_ID) +' - adding '+ str(df_TSL_existing_add_non_duplicate.shape[0]) +' new rows ['+ str(n_runs) +' of '+ str(n_glaciers) +' - '+ str(round(progress, 4)) +' %]')
df_result.to_hdf(output_file, key='TSLAs', mode='a', format='table', append=True, data_columns=['RGI_ID'])
n_runs += 1
# if n_runs == 10:
# break
# # Ensure Integer cols are detected corretly to avoid export error
# int_cols = ['glacier_DEM_min', 'glacier_DEM_max', 'scene']
# for int_col in int_cols:
# if df_result[int_col].dtype != np.int64:
# df_result = df_result.astype({int_col: int})
# drop_cols = ['year', 'month', 'TSL_max_norm', 'TSL_min_norm']
# for drop_col in drop_cols:
# if drop_col in df_result.columns:
# df_result.drop(columns=[drop_col], inplace=True);
#print('\nWriting result to file ...')
#df_result.to_hdf(output_file, key='TSLAs', mode='a', format='table', data_columns=['RGI_ID'])
print('\nProcessing finished.\n')