Merge pull request #35 from statisticsnorway/playground
Playground
jep739 authored Oct 3, 2024
2 parents 74dec4a + 59fa5ae commit 5bff5c2
Showing 8 changed files with 1,899 additions and 1,460 deletions.
2,020 changes: 1,039 additions & 981 deletions poetry.lock

Large diffs are not rendered by default.

53 changes: 40 additions & 13 deletions src/extra/nni tester/nni.py
@@ -323,26 +323,53 @@ def monte_carlo_simulation(data, iterations=number_iterations):

print(f"Average Mean Absolute Error: {avg_mae}")
print(f"Average R-squared: {avg_r_squared}")
print(f"Average Percentage Difference: {avg_percentage_diff}%")
# print(f"Average Percentage Difference: {avg_percentage_diff}%")

# Optionally, plot the distribution of MAE across simulations
mae_values = [result['MAE'] for result in summary_stats]
plt.hist(mae_values, bins=10, alpha=0.75)
plt.title('Distribution of MAE across Simulations')
plt.xlabel('Mean Absolute Error')
plt.ylabel('Frequency')
plt.show()
# # Optionally, plot the distribution of MAE across simulations
# mae_values = [result['MAE'] for result in summary_stats]
# plt.hist(mae_values, bins=10, alpha=0.75)
# plt.title('Distribution of MAE across Simulations')
# plt.xlabel('Mean Absolute Error')
# plt.ylabel('Frequency')
# plt.show()

# Adding Residual Plot
# # Adding Residual Plot
results_df['residuals'] = results_df['salgsint'] - results_df['predicted_salgsint']

# plt.figure(figsize=(10, 6))
# plt.scatter(results_df['predicted_salgsint'], results_df['residuals'], alpha=0.5)
# plt.axhline(y=0, color='k', linestyle='--', lw=2)
# plt.title('Residual Plot')
# plt.xlabel('Predicted Sales')
# plt.ylabel('Residuals (Actual - Predicted)')
# plt.show()

import seaborn as sns
mae_values = [result['MAE'] for result in summary_stats]
# Set a color palette
sns.set_palette("coolwarm", n_colors=10)

# Histogram of MAE with custom color and styling
plt.figure(figsize=(10, 6))
plt.hist(mae_values, bins=10, alpha=0.75, color=sns.color_palette()[1], edgecolor='black')
plt.title('Distribution of MAE across Simulations', fontsize=16)
plt.xlabel('Mean Absolute Error', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

plt.figure(figsize=(10, 6))
plt.scatter(results_df['predicted_salgsint'], results_df['residuals'], alpha=0.5)
plt.scatter(results_df['predicted_salgsint'], results_df['residuals'], alpha=0.5,
c=results_df['residuals'], cmap='coolwarm', edgecolor='black', s=50)
plt.axhline(y=0, color='k', linestyle='--', lw=2)
plt.title('Residual Plot')
plt.xlabel('Predicted Sales')
plt.ylabel('Residuals (Actual - Predicted)')
plt.title('Residual Plot', fontsize=16)
plt.xlabel('Predicted Sales', fontsize=12)
plt.ylabel('Residuals (Actual - Predicted)', fontsize=12)
plt.colorbar(label='Residuals')
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()




def evaluate_varehandel(current_year, start_year):
173 changes: 160 additions & 13 deletions src/functions/create_datafiles.py
@@ -41,8 +41,15 @@
import kommune_translate


def main(year, limit, skjema):
start_year = 2017
def main(year, limit, skjema_nr, distribution_percent):


# RA-1100 reporting starts in 2018; all other forms start in 2017
if skjema_nr == 'RA-1100':
start_year = 2018
else:
start_year = 2017

all_good_dataframes = [] # List to store good dataframes for each year
all_bad_dataframes = [] # List to store bad dataframes for each year
all_training_dataframes = [] # List to store training dataframes for each year
@@ -71,7 +78,8 @@ def main(year, limit, skjema):
fjor = current_year - 1 # Previous year

# skjema_list = ['RA-0174-1', 'RA-0174A3', 'RA-0827A3']
skjema_list = 'RA-0174-1'
# skjema_list = 'RA-0174-1'
skjema_list = skjema_nr
fil_path = [
f
for f in fs.glob(
@@ -123,6 +131,12 @@ def main(year, limit, skjema):
"TMP_NO_OMSETN",
"TMP_DRIFTSKOSTNAD_9010",
"TMP_DRIFTSKOSTNAD_9910",
"TMP_OMS",
"NO_OMS",
"B_DRIFTSKOSTNADER",
"B_OMSETNING",
"TMP_B_SN07_1",
"REG_TYPE_BEDRIFT"
]

# Filter the DataFrame for the specified field values
@@ -138,6 +152,106 @@ def main(year, limit, skjema):
skjema = skjema.reset_index()
skjema.columns = skjema.columns.str.lower() # Convert column names to lower case

# 'skjema_list' is the form number string, not a DataFrame column
if skjema_list == 'RA-1403':
# Rename the column 'tmp_oms' to 'tmp_no_omsetn'
skjema.rename(columns={'tmp_oms': 'tmp_no_omsetn'}, inplace=True)

if skjema_list == 'RA-0255-1':
# Rename the column 'no_oms' to 'tmp_no_omsetn'
skjema.rename(columns={'no_oms': 'tmp_no_omsetn'}, inplace=True)

# Get varekostnad data on foretak level
fil_path = [
f
for f in fs.glob(
f"gs://ssb-prod-noeku-data-produkt/statistikkfiler/g{current_year}/statistikkfil_foretak_pub.parquet"
)
if f.endswith(".parquet")
]

# Use the ParquetDataset to read multiple files
dataset = pq.ParquetDataset(fil_path, filesystem=fs)

table = dataset.read()

# Convert to Pandas DataFrame
foretak_pub = table.to_pandas()

# Check if current_year is 2023 or higher
if current_year >= 2023:
foretak_pub = foretak_pub[['nopost_p4005', 'enhets_id', 'nopost_driftskostnader']]
foretak_pub.rename(columns={'nopost_driftskostnader': 'tmp_driftskostnad_9010'}, inplace=True)
else:
foretak_pub = foretak_pub[['nopost_p4005', 'enhets_id']]


foretak_pub.rename(columns={'nopost_p4005': 'tmp_no_p4005', 'enhets_id': 'id'}, inplace=True)

skjema = pd.merge(skjema, foretak_pub, how='left', on='id')

# fill tmp_no_p4005 nan with 0
skjema['tmp_no_p4005'].fillna(0, inplace=True)

del foretak_pub, dataset, table

if skjema_list == 'RA-1100':

# Get data on foretak level
fil_path = [
f
for f in fs.glob(
f"gs://ssb-prod-noeku-data-produkt/statistikkfiler/g{current_year}/statistikkfil_foretak_pub.parquet"
)
if f.endswith(".parquet")
]

# Use the ParquetDataset to read multiple files
dataset = pq.ParquetDataset(fil_path, filesystem=fs)
table = dataset.read()

# Convert to Pandas DataFrame
foretak_pub = table.to_pandas()

foretak_pub = foretak_pub[['nopost_p4005', 'enhets_id', 'omsetning', 'naring_f']]

foretak_pub.rename(columns={'nopost_p4005': 'tmp_no_p4005', 'enhets_id': 'id', 'omsetning': 'tmp_no_omsetn', 'naring_f': 'nacef_5'}, inplace=True)

skjema = pd.merge(skjema, foretak_pub, how='left', on='id')

skjema.rename(columns={'b_omsetning': 'gjeldende_omsetn_kr', 'b_driftskostnader': 'driftskost_kr', 'tmp_b_sn07_1': 'tmp_sn2007_5', 'reg_type_bedrift': 'regtype'}, inplace=True)

skjema['gjeldende_bdr_syss'] = skjema['b_sysselsetting_syss']

# fill tmp_no_p4005 nan with 0
skjema['tmp_no_p4005'].fillna(0, inplace=True)

fil_path = [
f
for f in fs.glob(
f"gs://ssb-prod-noeku-data-produkt/statistikkfiler/g{fjor}/statistikkfil_bedrifter_pub.parquet"
)
if f.endswith(".parquet")
]

# Use the ParquetDataset to read multiple files
dataset = pq.ParquetDataset(fil_path, filesystem=fs)
table = dataset.read()

# Convert to Pandas DataFrame
bedrift_pub = table.to_pandas()

bedrift_pub = bedrift_pub[['sysselsetting_syss', 'enhets_id', 'omsetning']]

bedrift_pub.rename(columns={'sysselsetting_syss': 'fjor_syssel_t1', 'enhets_id': 'id', 'omsetning': 'fjor_omsetn_kr_t1'}, inplace=True)

skjema = pd.merge(skjema, bedrift_pub, how='left', on='id')

skjema['fjor_syssel_t1'].fillna(0, inplace=True)


del bedrift_pub, dataset, table

# Foretak-level data is always the rows where radnr == 0
foretak = skjema.loc[skjema["radnr"] == 0]

@@ -299,9 +413,15 @@ def main(year, limit, skjema):
].transform(lambda x: (x > 0).sum())

# Create 'bad_temp' DataFrame based on conditions
# bad_temp = good_temp_df[
# (good_temp_df["bedrift_count"] >= 2) & (good_temp_df["distribution_count"] < 2)
# ]

good_temp_df['distribution_rate'] = good_temp_df['distribution_count'] / good_temp_df['bedrift_count']
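# A group is flagged as 'bad' when it has at least two bedrifter but the
# share of units with distributed values is at or below distribution_percent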

bad_temp = good_temp_df[
(good_temp_df["bedrift_count"] > 2) & (good_temp_df["distribution_count"] < 2)
]
(good_temp_df["bedrift_count"] >= 2) & (good_temp_df["distribution_rate"] <= distribtion_percent)
]

bad_temp['driftskost_kr'] = np.nan

@@ -405,6 +525,8 @@ def main(year, limit, skjema):
merged_df["emp_delta"] = merged_df["gjeldende_bdr_syss"] / merged_df["fjor_syssel_t1"]

imputable_df = merged_df.copy()

test3 = imputable_df.copy()
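# test3 (like test4-test7 further down) appears to be a temporary debugging snapshot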


imputable_df = imputable_df.drop_duplicates(subset=["v_orgnr"])
@@ -431,12 +553,25 @@ def main(year, limit, skjema):
imputable_df["inntekt_delta"], errors="coerce"
)

general_inflation_rate = imputable_df.loc[
imputable_df["n4"] == "47.78", "inflation_rate"
].values[0]
imputable_df["inflation_rate"] = imputable_df["inflation_rate"].fillna(
general_inflation_rate
)
# general_inflation_rate = imputable_df.loc[
# imputable_df["n4"] == "47.78", "inflation_rate"
# ].values[0]


# Fetch the general inflation rate for the current year
general_inflation_rate = kpi.fetch_general_inflation_rate(current_year)

# If fetching for the current year fails, try fetching for the previous year
if general_inflation_rate is None:
general_inflation_rate = kpi.fetch_general_inflation_rate(current_year - 1)

# Fill missing inflation rate values with the fetched general inflation rate
imputable_df["inflation_rate"] = imputable_df["inflation_rate"].fillna(general_inflation_rate)


# imputable_df["inflation_rate"] = imputable_df["inflation_rate"].fillna(
# general_inflation_rate
# )

imputable_df["inflation_rate_oms"] = (
imputable_df["fjor_omsetn_kr_t1"] * imputable_df["inflation_rate"]
@@ -575,6 +710,8 @@ def main(year, limit, skjema):

imputable_df = pd.merge(imputable_df, knn_df, how="inner", on="v_orgnr")


# Leave on or off?
imputable_df_filtered = imputable_df[~imputable_df["regtype"].isin(["04", "11"])]


@@ -674,7 +811,7 @@ def main(year, limit, skjema):
current_year_good_oms = good_data[good_data['year'] == year]
current_year_bad_oms = bad_data[bad_data['year'] == year]
v_orgnr_list_for_imputering = current_year_bad_oms['v_orgnr'].tolist()

unique_id_list = current_year_bad_oms[current_year_bad_oms['nacef_5'].str.startswith('68')]['id'].unique().tolist()
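# nacef_5 codes starting with '68' are real-estate activities; their bad units are returned separately via unique_id_list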

# Quick NaN fill; used only for training, never for editing real data
training_data['tmp_sn2007_5'].fillna(training_data['nacef_5'], inplace=True)
@@ -743,10 +880,12 @@ def process_group(v_orgnr, group):
nan_counts = training_data.isna().sum()

# Print the result
print("Number of NaN values in training variables")
print(nan_counts)

training_data['gjeldende_bdr_syss'] = pd.to_numeric(training_data['gjeldende_bdr_syss'], errors='coerce')

# Remove temporary placeholder orgnr rows
training_data = training_data[~training_data['v_orgnr'].isin(['111111111', '123456789'])]

training_data = kommune_translate.translate_kommune_kodes_2(training_data)
@@ -853,7 +992,15 @@ def process_group(v_orgnr, group):
merging_df = current_year_bad_oms[['v_orgnr', 'id', 'year', 'lopenr']]

imputatable_df = pd.merge(merging_df, temp, on=['v_orgnr', 'id', 'year'], how='left')

test4 = imputatable_df.copy()

training_data = training_data[~training_data['v_orgnr'].isin(v_orgnr_list_for_imputering)]

test5 = current_year_good_oms.copy()

test6 = current_year_bad_oms.copy()

test7 = training_data.copy()

return current_year_good_oms, current_year_bad_oms, v_orgnr_list_for_imputering, training_data, imputatable_df, time_series_df
return current_year_good_oms, current_year_bad_oms, v_orgnr_list_for_imputering, training_data, imputatable_df, time_series_df, unique_id_list
42 changes: 42 additions & 0 deletions src/functions/kpi.py
@@ -472,3 +472,45 @@ def process_kpi_data(year):
kpi_df.rename(columns={"value": "inflation_rate", "konsumgrp": "n4"}, inplace=True)

return kpi_df



def fetch_general_inflation_rate(year):
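"""Fetch the twelve-month CPI change (Tolvmanedersendring) for December
of the given year from SSB StatBank table 03013; returns None on failure."""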
POST_URL = "https://data.ssb.no/api/v0/no/table/03013/"
bef_kom = {
"query": [
{
"code": "Konsumgrp",
"selection": {"filter": "vs:CoiCop2016niva1", "values": []},
},
{
"code": "ContentsCode",
"selection": {"filter": "item", "values": ["Tolvmanedersendring"]},
},
{
"code": "Tid",
"selection": {"filter": "item", "values": [str(year) + "M12"]},
},
],
"response": {"format": "json-stat2"},
}

# Process the query
resultat1 = requests.post(POST_URL, json=bef_kom)

# Check if request was successful
if resultat1.status_code == 200:
from pyjstat import pyjstat

# Convert JSON response to DataFrame
dataset1 = pyjstat.Dataset.read(resultat1.text)
df_temp = dataset1.write("dataframe")

# Extract value from the DataFrame (the inflation rate)
value = df_temp.iloc[0]["value"]

# Return the value as the general inflation rate
return value
else:
print("Failed to retrieve data:", resultat1.status_code)
return None
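
# Example usage (sketch; mirrors the year fallback in create_datafiles.py):
#     rate = fetch_general_inflation_rate(2023)
#     if rate is None:
#         rate = fetch_general_inflation_rate(2022)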