diff --git a/README.md b/README.md
index f699b34..892264e 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@
## News
-- ![new](https://img.alicdn.com/imgextra/i4/O1CN01kUiDtl1HVxN6G56vN_!!6000000000764-2-tps-43-19.png) [09-02-2023] We published a non-anthropogenic dataset [earthquake](https://drive.google.com/drive/folders/1ubeIz_CCNjHyuu6-XXD0T-gdOLm12rf4), which contains timestamped earthquake events over the Conterminous U.S from 1998 to 2023!
+- ![new](https://img.alicdn.com/imgextra/i4/O1CN01kUiDtl1HVxN6G56vN_!!6000000000764-2-tps-43-19.png) [09-02-2023] We published two non-anthropogenic datasets, [earthquake](https://drive.google.com/drive/folders/1ubeIz_CCNjHyuu6-XXD0T-gdOLm12rf4) and [volcano eruption](https://drive.google.com/drive/folders/1KSWbNi8LUwC-dxz1T5sOnd9zwAot95Tp?usp=drive_link)! See the Dataset section for details.
- [06-22-2023] Our paper [Language Model Can Improve Event Prediction by Few-Shot Abductive Reasoning](https://arxiv.org/abs/2305.16646) was accepted by the [Knowledge and Logical Reasoning Workshop, ICML'2023](https://klr-icml2023.github.io/cfp.html)!
- [05-29-2023] We released ``EasyTPP`` v0.0.1!
- [12-27-2022] Our paper [Bellman Meets Hawkes: Model-Based Reinforcement Learning via Temporal Point Processes](https://arxiv.org/abs/2201.12569) was accepted by AAAI'2023!
@@ -65,7 +65,12 @@ We preprocessed one synthetic and five real world datasets from widely-cited wor
- Taxi ([Whong, 2014](https://chriswhong.com/open-data/foil_nyc_taxi/)): timestamped taxi pick-up events.
- StackOverflow ([Leskovec, 2014](https://snap.stanford.edu/data/)): timestamped user badge reward events in StackOverflow.
- Taobao ([Xue et al, 2022](https://arxiv.org/abs/2210.01753)): timestamped user online shopping behavior events on the Taobao platform.
-- Amazon ([Amazon Review, 2018](https://nijianmo.github.io/amazon/)): timestamped user online shopping behavior events in Amazon platform.
+- Amazon ([Xue et al, 2022](https://nijianmo.github.io/amazon/)): timestamped user online shopping behavior events on the Amazon platform.
+
+At users' request, we processed two non-anthropogenic datasets:
+- [Earthquake](https://drive.google.com/drive/folders/1ubeIz_CCNjHyuu6-XXD0T-gdOLm12rf4): timestamped earthquake events over the conterminous U.S. from 1996 to 2023, processed from [USGS](https://www.usgs.gov/programs/earthquake-hazards/science/earthquake-data).
+- [Volcano eruption](https://drive.google.com/drive/folders/1KSWbNi8LUwC-dxz1T5sOnd9zwAot95Tp?usp=drive_link): timestamped volcano eruption events worldwide over the past several hundred years, processed from [The Smithsonian Institution](https://volcano.si.edu/).
+
All datasets are preprocessed into the `Gatech` format widely used by TPP researchers, and saved on [Google Drive](https://drive.google.com/drive/u/0/folders/1f8k82-NL6KFKuNMsUwozmbzDSFycYvz7) with public access.
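+
+Each split file is a pickled dict keyed by `dim_process` (the number of event types) and the split name. Below is a minimal sketch of reading one split; the field names follow the processing script `examples/script_data_processing/volcano.py` in this repo:
+
+```python
+import pickle
+
+# Load the training split of a Gatech-format dataset.
+with open('train.pkl', 'rb') as f:
+    data = pickle.load(f)
+
+print(data['dim_process'])  # number of event types, e.g. 1 for volcano eruptions
+# Each sequence is a list of event dicts with keys
+# 'time_since_start', 'time_since_last_event', and 'type_event'.
+print(data['train'][0][0])
+```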
@@ -191,9 +196,9 @@ This project is licensed under the [Apache License (Version 2.0)](https://github
## Todo List [Back to Top]
-- [ ] New dataset:
- - [ ] Earthquake: the source data is available in [USGS](https://www.usgs.gov/programs/earthquake-hazards/science/earthquake-data).
- - [ ] Volcano eruption: the source data is available in [NCEI](https://www.ngdc.noaa.gov/hazard/volcano.shtml).
+- [x] New dataset:
+ - [x] Earthquake: the source data is available in [USGS](https://www.usgs.gov/programs/earthquake-hazards/science/earthquake-data).
+ - [x] Volcano eruption: the source data is available in [NCEI](https://www.ngdc.noaa.gov/hazard/volcano.shtml).
- [ ] New model:
- [ ] Meta Temporal Point Process, ICLR 2023.
- [ ] Model-based RL via TPP, AAAI 2022.
diff --git a/examples/script_data_processing/volcano.py b/examples/script_data_processing/volcano.py
new file mode 100644
index 0000000..c073631
--- /dev/null
+++ b/examples/script_data_processing/volcano.py
@@ -0,0 +1,105 @@
+import calendar
+import datetime
+import pickle
+
+import numpy as np
+import pandas as pd
+
+
+def make_datetime(year, month, day):
+    """Convert a (year, month, day) triple to a positive timestamp in seconds."""
+    # The raw catalogue sometimes records a day that does not exist in the given
+    # month (e.g. day 31 in a 30-day month); clamp it to the last valid day.
+    last_day = calendar.monthrange(int(year), int(month))[1]
+    date = datetime.datetime(int(year), int(month), min(int(day), last_day))
+    return datetime.datetime.timestamp(date) + 61851630000  # make sure the timestamp is positive
+
+
+def clean_csv():
+    """Clean the raw eruption export and write a sorted, single-event-type CSV."""
+    source_path = 'events.csv'
+
+    df = pd.read_csv(source_path, header=0)
+
+    # Drop rows without a usable year, then fill missing months/days with 1
+    # (i.e. January / the first of the month).
+    df = df[~df['event_date_year'].isna()]
+    df = df[df['event_date_year'] > 0]
+    df['event_date_month'] = df['event_date_month'].fillna(1)
+    df['event_date_day'] = df['event_date_day'].fillna(1)
+    df.drop_duplicates(inplace=True)
+    norm_const = 1000000  # rescale seconds so inter-event times stay numerically small
+    df['event_timestamp'] = df.apply(
+        lambda x: make_datetime(x['event_date_year'], x['event_date_month'], x['event_date_day']),
+        axis=1) / norm_const
+    df.sort_values(by=['event_date_year', 'event_date_month', 'event_date_day'], inplace=True)
+    df['event_type'] = [0] * len(df)  # a single event type: eruption
+
+    df.to_csv('volcano.csv', index=False, header=True)
+    return
+
+
+def make_seq(df):
+    """Convert one volcano's events into a list of Gatech-format event dicts."""
+    seq = []
+    df['time_diff'] = df['event_timestamp'].diff()
+    df.index = np.arange(len(df))
+    for index, row in df.iterrows():
+        if index == 0:
+            # The first event anchors the sequence at time zero.
+            event_dict = {"time_since_last_event": 0.0,
+                          "time_since_start": 0.0,
+                          "type_event": row['event_type']
+                          }
+            start_event_time = row['event_timestamp']
+        else:
+            event_dict = {"time_since_last_event": row['time_diff'],
+                          "time_since_start": row['event_timestamp'] - start_event_time,
+                          "type_event": row['event_type']
+                          }
+        seq.append(event_dict)
+
+    return seq
+
+
+def make_pkl(target_path, dim_process, split, seqs):
+    """Pickle one split in the Gatech format: {'dim_process': K, <split name>: seqs}."""
+    with open(target_path, "wb") as f_out:
+        pickle.dump(
+            {
+                "dim_process": dim_process,
+                split: seqs
+            }, f_out
+        )
+    return
+
+
+def make_dataset(source_path):
+    """Group events by volcano, build one sequence per volcano, and write the splits."""
+    df = pd.read_csv(source_path, header=0)
+
+    vols = np.unique(df['volcano_name'])
+    total_seq = []
+    for vol in vols:
+        # Copy the per-volcano slice so that sorting and the time_diff column
+        # added in make_seq do not modify (or warn about) the parent frame.
+        df_ = df[df['volcano_name'] == vol].sort_values('event_timestamp').copy()
+        total_seq.append(make_seq(df_))
+
+    print(f'number of sequences: {len(total_seq)}')
+    # Fixed split points: the first 400 sequences go to train, the next 50 to
+    # dev, and the remainder to test.
+    make_pkl('train.pkl', 1, 'train', total_seq[:400])
+    count_seq(total_seq[:400])
+    make_pkl('dev.pkl', 1, 'dev', total_seq[400:450])
+    count_seq(total_seq[400:450])
+    make_pkl('test.pkl', 1, 'test', total_seq[450:])
+    count_seq(total_seq[450:])
+    return
+
+
+def count_seq(seqs):
+    """Report summary statistics for a list of sequences."""
+    total_len = [len(seq) for seq in seqs]
+    print(f'mean sequence length: {np.mean(total_len)}')
+    print(f'total number of events: {np.sum(total_len)}')
+    return
+
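+
+def load_split(path, split):
+    # Sanity-check helper (an assumed addition, not part of the original
+    # pipeline): read back a pickle written by make_pkl and report its shape.
+    with open(path, 'rb') as f_in:
+        data = pickle.load(f_in)
+    seqs = data[split]
+    print(f"{split}: dim_process={data['dim_process']}, {len(seqs)} sequences")
+    return seqs
+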
+
+if __name__ == '__main__':
+    # Step 1 (run once): turn the raw Smithsonian export events.csv into volcano.csv.
+    # clean_csv()
+    # Step 2: build the train/dev/test pickles from the cleaned CSV.
+    make_dataset('volcano.csv')
+    # Optional: load_split('train.pkl', 'train') to sanity-check the output.