diff --git a/.gitignore b/.gitignore
index 0640ec27..53852d66 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,4 +25,5 @@
 sf*/
 sf*.tar
 sf*.tar.gz
+paramgen/__pycache__/
 tools/paramgen/__pycache__/
\ No newline at end of file
diff --git a/tools/paramgen/parameter_curation.py b/paramgen/parameter_curation.py
similarity index 99%
rename from tools/paramgen/parameter_curation.py
rename to paramgen/parameter_curation.py
index 6a4de363..fbf9c8b8 100644
--- a/tools/paramgen/parameter_curation.py
+++ b/paramgen/parameter_curation.py
@@ -24,7 +24,7 @@
 THRESH_HOLD = 0
 THRESH_HOLD_6 = 0

-TRUNCATION_LIMIT = 10000
+TRUNCATION_LIMIT = 500

 BATCH_SIZE = 5000
 TIME_TRUNCATE = True
diff --git a/tools/paramgen/search_params.py b/paramgen/search_params.py
similarity index 100%
rename from tools/paramgen/search_params.py
rename to paramgen/search_params.py
diff --git a/tools/paramgen/time_select.py b/paramgen/time_select.py
similarity index 100%
rename from tools/paramgen/time_select.py
rename to paramgen/time_select.py
diff --git a/scripts/run_paramgen.sh b/scripts/run_paramgen.sh
new file mode 100644
index 00000000..67138c61
--- /dev/null
+++ b/scripts/run_paramgen.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+LDBC_FINBENCH_DATAGEN_JAR=target/ldbc_finbench_datagen-0.2.0-SNAPSHOT-jar-with-dependencies.jar
+OUTPUT_DIR=out/sf3/
+
+# Note: generate factor tables with --generate-factors
+
+echo "start factor table generation"
+
+time spark-submit --master local[*] \
+    --class ldbc.finbench.datagen.LdbcDatagen \
+    --driver-memory 480g \
+    ${LDBC_FINBENCH_DATAGEN_JAR} \
+    --output-dir ${OUTPUT_DIR} \
+    --factor-format csv \
+    --generate-factors
+
+echo "start parameter curation"
\ No newline at end of file
diff --git a/src/main/resources/scale_factors.xml b/src/main/resources/scale_factors.xml
index 18d70cdb..192a223e 100644
--- a/src/main/resources/scale_factors.xml
+++ b/src/main/resources/scale_factors.xml
@@ -1,40 +1,17 @@
-
-
-    generator.numPersons
-    800
-
-
-    generator.numCompanies
-    400
-
-
-    generator.numMediums
-    1000
-
-
-    transfer.minNumDegree
-    1
-
-
-    transfer.maxNumDegree
-    1000
-
-
-
     generator.numPersons
-    8000
+    1000
     generator.numCompanies
-    4000
+    1000
     generator.numMediums
-    10000
+    2000
     transfer.minNumDegree
@@ -49,15 +26,15 @@
     generator.numPersons
-    24000
+    3000
     generator.numCompanies
-    12000
+    3000
     generator.numMediums
-    30000
+    6000
     transfer.minNumDegree
@@ -92,18 +69,18 @@
-
+
diff --git a/tools/statistic.py b/tools/statistic.py
index 39623a03..45e5a2c9 100644
--- a/tools/statistic.py
+++ b/tools/statistic.py
@@ -4,10 +4,18 @@
 import collections


-def print_counts(counts):
+labels = ["person","personOwnAccount","personApplyLoan","personGuarantee","personInvest","blank","company","companyOwnAccount","companyApplyLoan","companyGuarantee","companyInvest","blank","account","transfer","withdraw","blank","loan","loantransfer","deposit","repay","blank","medium","signIn"]
+
+def print_original_counts(counts):
     for key, value in collections.OrderedDict(sorted(counts.items())).items():
         print("{}:{}".format(key, value))

+def print_formatted_counts(counts):
+    for label in labels:
+        if label == "blank":
+            print("================================")
+        else:
+            print("{}:{}".format(label, counts[label]))

 def count_entites(path):
     counts = {}
@@ -18,7 +26,9 @@
         for file in glob.glob(os.path.join(subdir_path, "*.csv")):
             num_entites += sum(1 for _ in open(file)) - 1
         counts[subdir] = num_entites
-    print_counts(counts)
+    print_original_counts(counts)
+    print("\n========== Formatted Output ============\n")
+    print_formatted_counts(counts)


 if __name__ == "__main__":