Skip to content

Commit

Permalink
move param curation scripts and update scale factor parameters (#113)
Browse files Browse the repository at this point in the history
  • Loading branch information
qishipengqsp authored Oct 16, 2024
1 parent 946fc48 commit 570c508
Show file tree
Hide file tree
Showing 7 changed files with 43 additions and 37 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@ sf*/
sf*.tar
sf*.tar.gz

paramgen/__pycache__/
tools/paramgen/__pycache__/
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

THRESH_HOLD = 0
THRESH_HOLD_6 = 0
TRUNCATION_LIMIT = 10000
TRUNCATION_LIMIT = 500
BATCH_SIZE = 5000
TIME_TRUNCATE = True

Expand Down
File renamed without changes.
File renamed without changes.
18 changes: 18 additions & 0 deletions scripts/run_paramgen.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
#
# Runs the LDBC FinBench datagen Spark job with --generate-factors so that
# factor tables are written (as CSV) under OUTPUT_DIR.
#
# The jar path, output directory and driver memory can be overridden through
# the environment; the defaults below are the values this script has always
# used.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline. Without this the script's exit status was
# that of the final `echo`, which hid a failed spark-submit run.
set -euo pipefail

LDBC_FINBENCH_DATAGEN_JAR="${LDBC_FINBENCH_DATAGEN_JAR:-target/ldbc_finbench_datagen-0.2.0-SNAPSHOT-jar-with-dependencies.jar}"
OUTPUT_DIR="${OUTPUT_DIR:-out/sf3/}"
DRIVER_MEMORY="${DRIVER_MEMORY:-480g}"

# Note: generate factor tables with --generate-factors

echo "start factor table generation"

# 'local[*]' is quoted so the shell never treats it as a glob pattern; the
# variable expansions are quoted so paths containing spaces stay one argument.
time spark-submit --master 'local[*]' \
    --class ldbc.finbench.datagen.LdbcDatagen \
    --driver-memory "${DRIVER_MEMORY}" \
    "${LDBC_FINBENCH_DATAGEN_JAR}" \
    --output-dir "${OUTPUT_DIR}" \
    --factor-format csv \
    --generate-factors

# NOTE(review): no curation command follows in this script as committed;
# only the banner is printed.
echo "start parameter curation"
45 changes: 11 additions & 34 deletions src/main/resources/scale_factors.xml
Original file line number Diff line number Diff line change
@@ -1,40 +1,17 @@
<?xml version="1.0"?>
<scale_factors>
<scale_factor name="0.01">
<property>
<name>generator.numPersons</name>
<value>800</value>
</property>
<property>
<name>generator.numCompanies</name>
<value>400</value>
</property>
<property>
<name>generator.numMediums</name>
<value>1000</value>
</property>
<property>
<name>transfer.minNumDegree</name>
<value>1</value>
</property>
<property>
<name>transfer.maxNumDegree</name>
<value>1000</value>
</property>
</scale_factor>

<scale_factor name="0.1">
<property>
<name>generator.numPersons</name>
<value>8000</value>
<value>1000</value>
</property>
<property>
<name>generator.numCompanies</name>
<value>4000</value>
<value>1000</value>
</property>
<property>
<name>generator.numMediums</name>
<value>10000</value>
<value>2000</value>
</property>
<property>
<name>transfer.minNumDegree</name>
Expand All @@ -49,15 +26,15 @@
<scale_factor name="0.3">
<property>
<name>generator.numPersons</name>
<value>24000</value>
<value>3000</value>
</property>
<property>
<name>generator.numCompanies</name>
<value>12000</value>
<value>3000</value>
</property>
<property>
<name>generator.numMediums</name>
<value>30000</value>
<value>6000</value>
</property>
<property>
<name>transfer.minNumDegree</name>
Expand Down Expand Up @@ -92,18 +69,18 @@
</property>
</scale_factor>

<!-- <scale_factor name="3">
<scale_factor name="3">
<property>
<name>generator.numPersons</name>
<value>240000</value>
<value>30000</value>
</property>
<property>
<name>generator.numCompanies</name>
<value>120000</value>
<value>30000</value>
</property>
<property>
<name>generator.numMediums</name>
<value>300000</value>
<value>60000</value>
</property>
<property>
<name>transfer.minNumDegree</name>
Expand All @@ -113,7 +90,7 @@
<name>transfer.maxNumDegree</name>
<value>1000</value>
</property>
</scale_factor> -->
</scale_factor>

<scale_factor name="10">
<property>
Expand Down
14 changes: 12 additions & 2 deletions tools/statistic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,18 @@
import collections


def print_counts(counts):
# Order in which print_formatted_counts reports entity counts. Each "blank"
# entry is a placeholder that is rendered as a separator line rather than
# looked up in the counts.
labels = [
    # person and its relations
    "person",
    "personOwnAccount",
    "personApplyLoan",
    "personGuarantee",
    "personInvest",
    "blank",
    # company and its relations
    "company",
    "companyOwnAccount",
    "companyApplyLoan",
    "companyGuarantee",
    "companyInvest",
    "blank",
    # account and its relations
    "account",
    "transfer",
    "withdraw",
    "blank",
    # loan and its relations
    "loan",
    "loantransfer",
    "deposit",
    "repay",
    "blank",
    # medium and its relations
    "medium",
    "signIn",
]

def print_original_counts(counts):
    """Print every entry of ``counts`` as ``key:value``, sorted by key.

    Args:
        counts: Mapping from a name to its count. Keys must be mutually
            orderable, since the entries are sorted before printing.
    """
    # sorted() already yields the items in key order, so no intermediate
    # OrderedDict is needed just to iterate them.
    for key, value in sorted(counts.items()):
        print("{}:{}".format(key, value))

def print_formatted_counts(counts):
    """Print the counts in the fixed order given by the module-level ``labels``.

    Each ``"blank"`` entry in ``labels`` is printed as a separator line; every
    other entry is printed as ``label:count``.

    Args:
        counts: Mapping from a label to its count. A label that is absent
            from the mapping is reported as 0 instead of raising ``KeyError``,
            so a missing entry no longer aborts the report part-way through.
    """
    for label in labels:
        if label == "blank":
            print("================================")
        else:
            print("{}:{}".format(label, counts.get(label, 0)))

def count_entites(path):
counts = {}
Expand All @@ -18,7 +26,9 @@ def count_entites(path):
for file in glob.glob(os.path.join(subdir_path, "*.csv")):
num_entites += sum(1 for _ in open(file)) - 1
counts[subdir] = num_entites
print_counts(counts)
print_original_counts(counts)
print("\n========== Formatted Output ============\n")
print_formatted_counts(counts)


if __name__ == "__main__":
Expand Down

0 comments on commit 570c508

Please sign in to comment.