Add CI test for all peptide cases
pm-blanco committed Mar 4, 2024
1 parent 7338fa6 commit ab716fa
Showing 9 changed files with 141 additions and 271 deletions.
18 changes: 2 additions & 16 deletions Makefile
@@ -2,7 +2,6 @@
.PHONY: sample
.PHONY: visual
.PHONY: clean
.PHONY: tests
.PHONY: testsuite
.PHONY: docs

@@ -12,7 +11,7 @@ docs:
pdoc ./pyMBE.py -o ./docs --docformat google

testsuite:
${ESPResSo_build_path}/pypresso testsuite/LYS_ASP_peptide.py
${ESPResSo_build_path}/pypresso testsuite/peptide_tests.py

sample:
${ESPResSo_build_path}/pypresso sample_scripts/peptide_simulation_example.py
@@ -22,17 +21,4 @@ visual:
vmd -e visualization.tcl

tutorial:
${ESPResSo_build_path}/ipypresso notebook sugar_tutorial.ipynb

tests_peptide:
${ESPResSo_build_path}/pypresso tests/LYS_ASP_peptide.py
${ESPResSo_build_path}/pypresso tests/GLU_HIS_peptide.py
${ESPResSo_build_path}/pypresso tests/histatin5_peptide.py

tests_globular_protein:
python3 tests/run_test_protein.py --pdb_code 1beb --run_command "${ESPResSo_build_path}/pypresso sample_scripts/globular_protein.py --pdb 1beb --path_to_cg reference_parameters/coarse_grained_structures/1beb.vtf"
python3 tests/run_test_protein.py --pdb_code 1f6s --run_command "${ESPResSo_build_path}/pypresso sample_scripts/globular_protein.py --pdb 1f6s --metal_ion_name Ca --metal_ion_charge 2 --path_to_cg reference_parameters/coarse_grained_structures/1f6s.vtf"

tests:
make tests_peptide
make tests_globular_protein
${ESPResSo_build_path}/ipypresso notebook sugar_tutorial.ipynb
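For reference, the consolidated target can be invoked from the repository root with the build path set on the command line, e.g. `make testsuite ESPResSo_build_path=~/software/espresso/build` (path illustrative).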
73 changes: 39 additions & 34 deletions lib/analysis.py
@@ -24,6 +24,39 @@ def add_data_to_df(df, data_dict, index):
index=index)])
return updated_df

def analyze_time_series(path_to_datafolder):
"""
Analyzes all time series stored in `path_to_datafolder` using the block binning method.
Args:
path_to_datafolder(`str`): path to the folder containing the time-series files
Returns:
(`obj`): pandas DataFrame with the block-analysis statistics (mean, err_mean, n_eff, tau_int) of each time series, labeled by the parameters parsed from the file names
"""
data=pd.DataFrame()
with os.scandir(path_to_datafolder) as subdirectory:
# Gather all data
for subitem in subdirectory:
if subitem.is_file():
if 'time_series' in subitem.name:
# Get parameters from the file name
data_dict=get_params_from_dir_name(subitem.name.replace('_time_series.csv', ''))
# Get the observables for binning analysis
time_series_data=read_csv_file(path=f"{path_to_datafolder}/{subitem.name}")
analyzed_data=block_analyze(full_data=time_series_data)
value_list=[]
index_list=[]
for key in data_dict.keys():
value_list.append(data_dict[key])
index_list.append((key,))
analyzed_data = pd.concat([pd.Series(value_list, index=index_list), analyzed_data])
data = add_data_to_df(df=data,
data_dict=analyzed_data.to_dict(),
index=[len(data)])
return data
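
A minimal usage sketch of this new helper (the import path and folder layout are assumptions based on this repository's structure):

```python
# Sketch: analyze every "*_time_series.csv" file in a folder and collect
# one row per file, combining the parameters parsed from the file name
# with the block-analysis statistics. Paths are illustrative.
from lib import analysis

results = analysis.analyze_time_series(path_to_datafolder="samples/Beyer2024/time_series")
print(results.head())
```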

def do_binning_analysis(list_time_series_df, frac_data_to_discard=0.3):
"""
Does a binning analysis of all Pandas DataFrame objects in `list_time_series_df`.
@@ -77,9 +110,6 @@ def merge_time_series_dfs(list_time_series_df,frac_data_to_discard=0,rescale_tim
binning_df["time"]+=index*binning_df.shape[0]*dt
list_time_series_df[index]=binning_df

# Join all the dataframes for binning analysis
gathered_binning_df=pd.concat(list_time_series_df)

@@ -154,8 +184,6 @@ def split_dataframe(df,n_blocks):

# Second-stage blocks (s2): block_size_s2 = df.shape[0]//n_blocks
n_blocks_s2= n_blocks - n_blocks_s1
block_size_s2=df.shape[0]//n_blocks
blocks+=split_dataframe_in_equal_blocks(df=df,
start_row=n_blocks_s1*block_size_s1,
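
A worked example of the two-stage split (a sketch; the first-stage block size is assumed to be one row larger than the second-stage one, consistent with the remainder arithmetic visible above):

```python
# Illustrative: splitting 1000 rows into 16 blocks.
n_rows, n_blocks = 1000, 16
block_size_s2 = n_rows // n_blocks        # 62 rows per second-stage block
n_blocks_s1 = n_rows % n_blocks           # 8 first-stage blocks absorb the remainder
n_blocks_s2 = n_blocks - n_blocks_s1      # 8 second-stage blocks
block_size_s1 = block_size_s2 + 1         # 63 rows (assumed from the remainder logic)
assert n_blocks_s1*block_size_s1 + n_blocks_s2*block_size_s2 == n_rows
```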
@@ -173,40 +201,30 @@ def get_time_series_from_average_df(pd_series, label):
Returns:
(`obj`): pandas DataFrame with the time series and their statistical error
Authors:
- Pablo M. Blanco, Norwegian University of Science and Technology
"""
import numpy as np
dist_dict={}
expected_labels=["mean", "errmean", "nsamples"]

for data_str in pd_series[label]:
clean_data_str=data_str.strip("{").strip("}")
for data_set in clean_data_str.split("]"):
data_list=data_set.split(":")
if len(data_list) == 1:
continue
# Parse data and label
data=list(data_list[1][2:].split(","))
data=np.array([float(x) for x in data])

clean_label=data_list[0].strip("'").strip(",").strip()
label_sts=clean_label.split("_")[-1]
label_qty=clean_label[:-len(label_sts)-1].strip("'")
if label_sts not in expected_labels:
raise ValueError(f"Error while parsing the df, found label for stats {label_sts}")

if label_qty in dist_dict.keys():
dist_dict[label_qty][label_sts]=data
else:
dist_dict[label_qty]={}
dist_dict[label_qty][label_sts]=data
return pd.DataFrame.from_dict(dist_dict)
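
For reference, a hypothetical serialized entry of the kind this parser expects (the quantity name `charge` and all values are illustrative): keys are quantity names suffixed with one of the expected stats labels, and values are lists of numbers:

```python
# Parsing a string like this with the loop above yields a DataFrame with one
# column per quantity ("charge") and the arrays for "mean", "errmean" and
# "nsamples" collected under it.
raw = "{'charge_mean': [0.10,0.12], 'charge_errmean': [0.01,0.01], 'charge_nsamples': [500,500]}"
```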


@@ -224,13 +242,6 @@ def block_analyze(full_data, n_blocks=16, time_col = "time", equil=0.1, columns
Returns:
`result`: pandas dataframe with the mean (mean), statistical error (err_mean), number of effective samples (n_eff) and correlation time (tau_int) of each observable.
Note:
This function should not be modified without consulting first its authors.
Authors:
- Peter Kosovan, Charles University
- Pablo M. Blanco, Norwegian University of Science and Technology
"""

dt = get_dt(full_data) # check that the data was stored with the same time interval dt
@@ -255,8 +266,6 @@
print(f"n_blocks b = {n_blocks}, block_size k = {block_size}")

# calculate the mean for each block
series_data = pd.Series(data.iat[0, 0], index=data.columns)
blocks = split_dataframe(df=data,
n_blocks=n_blocks)

@@ -271,9 +280,10 @@
err_mean = np.sqrt(var_blocks/n_blocks) # standard error of the mean by eq.(37) of Janke
tau_int = dt*block_size * var_blocks / var_all / 2. # eq.(38) of Janke
n_eff = n_samples / (2*tau_int/dt) # effective number of samples in the whole simulation using eq.(28) of Janke

# concatenate the observables and attribute a key to each (unique index)
result = pd.concat( [ mean, err_mean, n_eff, tau_int], keys= [ "mean", "err_mean", "n_eff", "tau_int" ])
result = pd.concat( [ pd.Series({"n_blocks":n_blocks,"block_size":block_size}), result] )
result = pd.concat( [ mean, err_mean, n_eff, tau_int], keys= [ "mean", "err_mean", "n_eff", "tau_int" ], join="inner")
result = pd.concat( [ pd.Series([n_blocks,block_size], index=[('n_blocks',),('block_size',)]), result])
return result
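
As a rough sanity check of these estimators (a sketch; the import path is an assumption based on this repository's layout): for uncorrelated samples the blocked error of the mean should approach 1/sqrt(n_samples) and tau_int should approach dt/2, since there is no correlation.

```python
# Feed synthetic white-noise data through block_analyze and compare the
# reported statistics against the analytical expectations for
# uncorrelated samples. Values are illustrative.
import numpy as np
import pandas as pd
from lib.analysis import block_analyze

rng = np.random.default_rng(42)
df = pd.DataFrame({"time": np.arange(10_000, dtype=float),  # uniform dt = 1
                   "obs": rng.normal(size=10_000)})
stats = block_analyze(full_data=df, n_blocks=16)
print(stats)  # expect err_mean ~ 0.01 and tau_int ~ 0.5 for "obs"
```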

def get_dt(data):
Expand Down Expand Up @@ -373,7 +383,6 @@ def create_histogram_df_from_distribution_list(distribution_list, start, end, nb
cnt+=5000
return pd.DataFrame.from_dict(dict_hist)


def find_index_with_value_in_df(df,column_name, value, tol=0.01):
"""
Finds the index of the row in the pandas DataFrame `df` whose value in column `column_name` matches `value` within a relative tolerance.
@@ -386,7 +395,6 @@ def find_index_with_value_in_df(df,column_name, value, tol=0.01):
Returns:
index (int): Index found.
"""
index = np.where(abs(df[column_name]-value)/value < tol)
return index[0]
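
For example (DataFrame and column names illustrative); note that the tolerance is relative, so `value` must be nonzero:

```python
# Positional indices where the "pH" column is within 1% of 7.0.
idx = find_index_with_value_in_df(df=data, column_name="pH", value=7.0)
```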
@@ -403,9 +411,6 @@ def built_output_name(input_dict):
Note:
The standard formatting rule is parametername1-parametervalue1_parametername2-parametervalue2
Authors:
- Pablo M. Blanco, Norwegian University of Science and Technology (NTNU)
"""
output_name=""
for label in input_dict:
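Under this rule, a hypothetical call would behave as follows (the exact handling of the trailing separator is not visible in this excerpt):

```python
built_output_name(input_dict={"pH": 7.0, "sequence": "KKKDDD"})
# expected: "pH-7.0_sequence-KKKDDD" under the stated formatting rule
```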
28 changes: 3 additions & 25 deletions samples/Beyer2024/create_paper_data.py
@@ -40,7 +40,6 @@
raise ValueError(f"Mode {mode} is not currently supported, valid modes are {valid_modes}")

## Peptide plots (Fig. 6)
labels_fig6=["6a", "6b", "6c"]

if fig_label in labels_fig6:
@@ -54,40 +53,19 @@
else:
raise RuntimeError()
pH_range = np.linspace(2, 12, num=21)

for pH in pH_range:
run_command=f"python3 {script_path} --sequence {sequence} --pH {pH} --mode {mode}"
print(run_command)
os.system(run_command)

# Read all files in the subdir
data_files=[]
# Analyze all time series
time_series_folder_path=pmb.get_resource("samples/Beyer2024/time_series")
data=analysis.analyze_time_series(path_to_datafolder=time_series_folder_path)

data=pd.DataFrame()

with os.scandir(time_series_folder_path) as subdirectory:
# Gather all data
for subitem in subdirectory:
if subitem.is_file():
if 'time_series' in subitem.name:
# Get parameters from the file name
data_dict=analysis.get_params_from_dir_name(subitem.name.replace('_time_series.csv', ''))
file_data=pd.DataFrame(data_dict, index=[0])
# Get the observables for binning analysis
time_series_data=analysis.read_csv_file(path=f"{time_series_folder_path}/{subitem.name}")
analyzed_data=analysis.block_analyze(full_data=time_series_data)
data_dict.update(analyzed_data.to_dict())
data = analysis.add_data_to_df(df=data,
data_dict=data_dict,
index=[len(data)])
for param in data_dict.keys():
analyzed_data[param]=data_dict[param]

# Store mean values and other statistics
data_path=pmb.get_resource("samples/Beyer2024/")+"data"
if not os.path.exists(data_path):
os.makedirs(data_path)

data.to_csv(f"{data_path}/fig{fig_label}.csv")

# Plot the data
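For orientation, each pass of the pH loop above shells out a command of the form `python3 samples/Beyer2024/peptide.py --sequence <sequence> --pH <pH> --mode <mode>`, and `analysis.analyze_time_series` then reduces every stored time series to a single row of statistics before the results are written to `fig{fig_label}.csv` under `samples/Beyer2024/data`.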
9 changes: 6 additions & 3 deletions samples/Beyer2024/peptide.py
@@ -19,6 +19,7 @@
# Create an instance of pyMBE library
pmb = pyMBE.pymbe_library()

valid_modes=["short-run","long-run", "test"]
parser = argparse.ArgumentParser(description='Script to run the peptide test cases for pyMBE')
parser.add_argument('--sequence',
type=str,
@@ -35,7 +36,7 @@
parser.add_argument('--mode',
type=str,
default= "short-run",
help='sets for how long the simulation runs, valid modes are "short-run" and "long-run"')
help=f'sets for how long the simulation runs, valid modes are {valid_modes}')
args = parser.parse_args()

# Inputs
@@ -46,7 +47,6 @@

mode=args.mode

valid_modes=["short-run","long-run"]
if mode not in valid_modes:
raise ValueError(f"Mode {mode} is not currently supported, valid modes are {valid_modes}")

@@ -55,9 +55,12 @@
Nsamples = 1000
MD_steps_per_sample = 1000

if mode == "long-run":
elif mode == "long-run":
Nsamples = 5000
MD_steps_per_sample = 5000
elif mode == "test":
Nsamples = 500
MD_steps_per_sample = 700

SEED = 100
dt = 0.01
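The new test mode can also be exercised on its own, e.g. `python3 samples/Beyer2024/peptide.py --sequence KKKDDD --pH 7 --mode test` (sequence and pH illustrative), which runs the shorter trajectory of 500 samples with 700 MD steps per sample.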