From 6573e172fb03296cf0f5b0089e6f1ecb1b17687b Mon Sep 17 00:00:00 2001
From: blancoapa <apple.pablo@hotmail.com>
Date: Fri, 1 Mar 2024 18:22:09 +0100
Subject: [PATCH] Add script to standarize reference data sets, add plotting
 functionality to reproduce the paper data

---
 mantainer/standarize_data.py                  |  62 ++++++++
 samples/Beyer2024/create_paper_data.py        | 141 +++++++++++++++---
 samples/Beyer2024/peptide.py                  |   2 +-
 testsuite/data/{ => src}/1beb-10mM-torres.dat |   0
 testsuite/data/{ => src}/1f6s-10mM-torres.dat |   0
 testsuite/data/{ => src}/Glu-HisMSDE.csv      |   0
 testsuite/data/{ => src}/Lys-AspMSDE.csv      |   0
 testsuite/data/{ => src}/data_landsgesell.csv |   0
 .../{ => src}/excess_chemical_potential.dat   |   0
 .../data/{ => src}/histatin5_SoftMatter.txt   |   0
 10 files changed, 184 insertions(+), 21 deletions(-)
 create mode 100644 mantainer/standarize_data.py
 rename testsuite/data/{ => src}/1beb-10mM-torres.dat (100%)
 rename testsuite/data/{ => src}/1f6s-10mM-torres.dat (100%)
 rename testsuite/data/{ => src}/Glu-HisMSDE.csv (100%)
 rename testsuite/data/{ => src}/Lys-AspMSDE.csv (100%)
 rename testsuite/data/{ => src}/data_landsgesell.csv (100%)
 rename testsuite/data/{ => src}/excess_chemical_potential.dat (100%)
 rename testsuite/data/{ => src}/histatin5_SoftMatter.txt (100%)

diff --git a/mantainer/standarize_data.py b/mantainer/standarize_data.py
new file mode 100644
index 0000000..f448abf
--- /dev/null
+++ b/mantainer/standarize_data.py
@@ -0,0 +1,62 @@
+# Import pyMBE and other libraries
+import pyMBE
+import numpy as np
+import pandas as pd
+import argparse 
+
+# Create an instance of pyMBE library
+pmb = pyMBE.pymbe_library()
+
+# Expected inputs
+supported_filenames=["Glu-HisMSDE.csv",
+                     "Lys-AspMSDE.csv",
+                     "histatin5_SoftMatter.txt"]
+
+parser = argparse.ArgumentParser(description='Script to standarize the data from various authors')
+parser.add_argument('--src_filename', 
+                    type=str, 
+                    required= True,  
+                    help='File name from testsuite/data/src. Currently supported {supported_filenames}')
+args = parser.parse_args()
+
+# Inputs
+filename=args.src_filename
+
+# Outputs
+output_filenames={"Lys-AspMSDE.csv": "Lunkad2021a.csv",
+                  "Glu-HisMSDE.csv": "Lunkad2021b.csv",
+                  "histatin5_SoftMatter.txt": "Blanco2020a.csv"}
+
+# Sanity checks
+if filename not in supported_filenames:
+    ValueError(f"Filename {filename} not supported, supported files are {supported_filenames}")
+
+# Extact the data from Ref.
+ref_path=pmb.get_resource(f"testsuite/data/src/{filename}")
+Refs_lunkad=["Glu-HisMSDE.csv","Lys-AspMSDE.csv"]
+Ref_blanco=["histatin5_SoftMatter.txt"]
+
+if filename in Refs_lunkad:
+    data=pd.read_csv(ref_path)
+    Z_ref = 5*-1*data['aaa']+5*data['aab']
+    # Error propagation calculation
+    # 1/4 factor added to correct for bug in the original calculation of the error reported by the authors       
+    Z_ref_err = 5/4*np.sqrt((data['eaa'])**2+(data['eab'])**2)   
+    
+elif filename in Ref_blanco:
+    data=np.loadtxt(ref_path, delimiter=",")
+    Z_ref=data[:,1]         
+    Z_ref_err=data[:,2]
+else:
+    raise RuntimeError()
+
+pH_range = np.linspace(2, 12, num=21)
+
+# Store the data
+data=pd.DataFrame({"pH": pH_range,
+                  "charge": Z_ref,
+                  "charge_error": Z_ref_err})
+
+data_path=pmb.get_resource(f"testsuite/data")
+data.to_csv(f"{data_path}/{output_filenames[filename]}", 
+            index=False)
\ No newline at end of file
diff --git a/samples/Beyer2024/create_paper_data.py b/samples/Beyer2024/create_paper_data.py
index 28b3991..d0085de 100644
--- a/samples/Beyer2024/create_paper_data.py
+++ b/samples/Beyer2024/create_paper_data.py
@@ -1,42 +1,37 @@
-# Load espresso, sugar and other necessary libraries
-import sys
+# Import pyMBE and other libraries
+import pyMBE
+from lib import analysis
 import os 
-import inspect
-import espressomd
 import numpy as np
 import pandas as pd
 import argparse 
-from tqdm import tqdm
-from espressomd.io.writer import vtf
-from espressomd import interactions
-from espressomd import electrostatics
-
-# Import pyMBE
-import pyMBE
-from lib import analysis
-from lib import handy_functions as hf
 
 # Create an instance of pyMBE library
 pmb = pyMBE.pymbe_library()
 
+valid_fig_labels=["6a", "6b", "6c"]
+
 parser = argparse.ArgumentParser(description='Script to create the data from Beyer2024')
 parser.add_argument('--fig_label', 
                     type=str, 
                     required= True,  
-                    help='Label of the corresponding figure in Beyer2024, currently supported: 6a, 6b, 6c.')
+                    help=f'Label of the corresponding figure in Beyer2024, currently supported: {valid_fig_labels}')
 parser.add_argument('--mode', 
                     type=str, 
                     default= "long-run",  
-                    help='sets for how long the simulation runs, valid modes are "short-run" and "long-run"')
+                    help='Sets for how long the simulation runs, valid modes are "short-run" and "long-run"')
+parser.add_argument('--plot', 
+                    type=bool, 
+                    default= False,  
+                    help='Switch to activate/deactivate to plot the data"')
 args = parser.parse_args()
 
 # Inputs
 fig_label=args.fig_label
 mode=args.mode
+plot=args.plot
 
 # Sanity checks
-valid_fig_labels=["6a", "6b", "6c"]
-
 if fig_label not in valid_fig_labels:
     raise ValueError(f"The figure label {fig_label} is not supported. Supported figure labels are {valid_fig_labels}")
 
@@ -45,7 +40,10 @@
     raise ValueError(f"Mode {mode} is not currently supported, valid modes are {valid_modes}")
 
 ## Peptide plots (Fig. 6)
-if fig_label in ["6a", "6b", "6c"]:
+
+labels_fig6=["6a", "6b", "6c"]
+
+if fig_label in labels_fig6:
     script_path=pmb.get_resource(f"samples/Beyer2024/peptide.py")
     if fig_label == "6a":
         sequence="K"*5+"D"*5
@@ -79,8 +77,12 @@
                 # Get the observables for binning analysis
                 time_series_data=analysis.read_csv_file(path=f"{time_series_folder_path}/{subitem.name}")
                 analyzed_data=analysis.block_analyze(full_data=time_series_data)
-                file_data=pd.concat([file_data,analyzed_data])
-                data = pd.concat([data,file_data])
+                data_dict.update(analyzed_data.to_dict())
+                data = analysis.add_data_to_df(df=data,
+                                               data_dict=data_dict,
+                                               index=[len(data)])
+                for param in data_dict.keys():
+                    analyzed_data[param]=data_dict[param]
 
 data_path=pmb.get_resource("samples/Beyer2024/")+"data"
 if not os.path.exists(data_path):
@@ -88,7 +90,106 @@
 
 data.to_csv(f"{data_path}/fig{fig_label}.csv")
 
+# Plot the data
+if plot:
+
+    # Import matplotlib for plotting
+    import matplotlib.pyplot as plt
+    import matplotlib as mpl
+    
+    plt.rcParams["font.family"] = "serif"
+    plt.tight_layout()
+    mpl.rc('axes', linewidth=1)
+    mpl.rcParams['lines.markersize'] = 5
+    mpl.rcParams['lines.linewidth'] = 1.0
+
+    width = 10
+    height = 10
+    plt.rc('font', size=22)
+    plt.figure(figsize=(width,height), dpi=100)
+    plt.grid(which='major', 
+             color='#CCCCCC', 
+             linestyle='--', 
+             linewidth=0.6)
+
+    # Set labels for the axes
+    if fig_label in labels_fig6:
+        plt.ylabel(r"Net charge $Z/e$")
+        plt.xlabel(r"pH in the solution")
+    else:
+        raise RuntimeError()
 
+    # Load pka set
+    if fig_label in ["6a","6b"]:
+        pka_path=pmb.get_resource("parameters/pka_sets/CRC1991.txt")
+    elif fig_label == "6c":
+        pka_path=pmb.get_resource("parameters/pka_sets/Nozaki1967.txt")
+    else:
+        raise RuntimeError()
+    pmb.load_pka_set (filename=pka_path)
+    
+    # Load ref data    
+    if fig_label == "6a":
+        ref_path=pmb.get_resource("testsuite/data/Lunkad2021a.csv")
+    elif fig_label == "6b":
+        ref_path=pmb.get_resource("testsuite/data/Lunkad2021b.csv")
+    elif fig_label == "6c":
+        ref_path=pmb.get_resource("testsuite/data/Blanco2020a.csv")
+    else:
+        raise RuntimeError()
+    
+    ref_data=analysis.read_csv_file(path=ref_path)
+    
+    # Calculate Henderson-Hasselbalch (HH
+    if fig_label in labels_fig6:
+        pmb.define_peptide (name=sequence, 
+                            sequence=sequence,
+                            model="1beadAA")
+    
+    pH_range_HH = np.linspace(2, 12, num=1000)
+    Z_HH = pmb.calculate_HH(object_name=sequence,
+                            pH_list=pH_range_HH)
+
+    # Plot HH
+    plt.plot(pH_range_HH,
+               Z_HH, 
+               label=r"HH", 
+               color="black")
+    
+    # Plot Ref data
+    ref_data=ref_data.sort_values(by="pH",ascending=True)
+    plt.errorbar(ref_data["pH"], 
+                   ref_data["charge"], 
+                   ref_data["charge_error"], 
+                   linestyle="none", 
+                   marker="s", 
+                   label="Lunkad  et al.", 
+                   ms=15,
+                   color="C0")
+    
+    # Style for Blanco linestyle="none", marker="^", label="Blanco et al.", color="green",  markeredgewidth=1.5
+    # Plot data produced by pyMBE
+    
+    data=data.astype({'pH': 'float'}).sort_values(by="pH")
+    plt.errorbar(data["pH"], 
+                   data["mean","charge"], 
+                   yerr=data["err_mean","charge"], 
+                   linestyle="none", 
+                   marker="o", 
+                   label="pyMBE", 
+                   color="C1", 
+                   fillstyle="none",
+                   ms=15, 
+                   markeredgewidth=1.5)
+    
+    # Save plot
+    fig_path=pmb.get_resource("samples/Beyer2024")+"/figs"
+    if not os.path.exists(fig_path):
+        os.makedirs(fig_path)
+    plt.legend()
+    plt.savefig(f"{fig_path}/{fig_label}.png", 
+                bbox_inches='tight')
+    plt.close()
 
 
 
diff --git a/samples/Beyer2024/peptide.py b/samples/Beyer2024/peptide.py
index 73179b0..24b8d91 100644
--- a/samples/Beyer2024/peptide.py
+++ b/samples/Beyer2024/peptide.py
@@ -205,7 +205,7 @@
 time_series=pd.DataFrame(time_series)
 filename=analysis.built_output_name(input_dict=inputs)
 
-time_series.to_csv(f"{data_path}/{filename}_time_series.csv")
+time_series.to_csv(f"{data_path}/{filename}_time_series.csv", index=False)
 
 if args.verbose:
     print("*** DONE ***")
diff --git a/testsuite/data/1beb-10mM-torres.dat b/testsuite/data/src/1beb-10mM-torres.dat
similarity index 100%
rename from testsuite/data/1beb-10mM-torres.dat
rename to testsuite/data/src/1beb-10mM-torres.dat
diff --git a/testsuite/data/1f6s-10mM-torres.dat b/testsuite/data/src/1f6s-10mM-torres.dat
similarity index 100%
rename from testsuite/data/1f6s-10mM-torres.dat
rename to testsuite/data/src/1f6s-10mM-torres.dat
diff --git a/testsuite/data/Glu-HisMSDE.csv b/testsuite/data/src/Glu-HisMSDE.csv
similarity index 100%
rename from testsuite/data/Glu-HisMSDE.csv
rename to testsuite/data/src/Glu-HisMSDE.csv
diff --git a/testsuite/data/Lys-AspMSDE.csv b/testsuite/data/src/Lys-AspMSDE.csv
similarity index 100%
rename from testsuite/data/Lys-AspMSDE.csv
rename to testsuite/data/src/Lys-AspMSDE.csv
diff --git a/testsuite/data/data_landsgesell.csv b/testsuite/data/src/data_landsgesell.csv
similarity index 100%
rename from testsuite/data/data_landsgesell.csv
rename to testsuite/data/src/data_landsgesell.csv
diff --git a/testsuite/data/excess_chemical_potential.dat b/testsuite/data/src/excess_chemical_potential.dat
similarity index 100%
rename from testsuite/data/excess_chemical_potential.dat
rename to testsuite/data/src/excess_chemical_potential.dat
diff --git a/testsuite/data/histatin5_SoftMatter.txt b/testsuite/data/src/histatin5_SoftMatter.txt
similarity index 100%
rename from testsuite/data/histatin5_SoftMatter.txt
rename to testsuite/data/src/histatin5_SoftMatter.txt