Team 9 QML4FRAUD Submission #3

Open: wants to merge 30 commits into base: main

Changes from all commits (30 commits)
ec90cc8
Create README.md
JonasTYW Jul 25, 2022
f247c66
First version of PCA and LDA variational Pennylane
maximer-v Jul 25, 2022
30462d4
Add - Architecture notebook, half-finished
JonasTYW Jul 26, 2022
9c8f53b
Clean-up and feature grouping
FoggyBrain Jul 26, 2022
a235f16
Slight modification to EDA.ipynb
FoggyBrain Jul 26, 2022
a3e7ec3
add architecture
JonasTYW Jul 26, 2022
9582a50
Updated EDA
FoggyBrain Jul 27, 2022
d4b73ae
Update data.py
JonasTYW Jul 27, 2022
3c62e76
Update data.py
JonasTYW Jul 27, 2022
fe68d78
Update data.py
JonasTYW Jul 27, 2022
f30d5d1
update experiments
JonasTYW Jul 27, 2022
9d15b61
Create Business_application.md
JonasTYW Jul 27, 2022
689c178
Merge branch 'main' of https://github.com/JonasTYW/Hackathon2022
JonasTYW Jul 27, 2022
bb0a638
Updated EDA and 3 qubit notebooks
FoggyBrain Jul 27, 2022
6088676
Merge branch 'main' of https://github.com/JonasTYW/Hackathon2022
FoggyBrain Jul 27, 2022
2675bcf
Update Solution via architecture.ipynb
JonasTYW Jul 27, 2022
2bd87c0
Update Business_application.md
FoggyBrain Jul 27, 2022
1f9974b
Merge branch 'main' of https://github.com/JonasTYW/Hackathon2022
FoggyBrain Jul 27, 2022
5f2c957
Update Business_application.md
FoggyBrain Jul 27, 2022
ec03bce
Update README.md
JonasTYW Jul 27, 2022
f46ead4
add pics
JonasTYW Jul 27, 2022
33cb440
Merge branch 'main' of https://github.com/JonasTYW/Hackathon2022
JonasTYW Jul 27, 2022
5b93afd
put to backup
JonasTYW Jul 27, 2022
aba1a5e
Update README.md
JonasTYW Jul 27, 2022
b370a5b
Update README.md
JonasTYW Jul 27, 2022
49be3f9
update pic
JonasTYW Jul 27, 2022
9c7d367
Merge branch 'main' of https://github.com/JonasTYW/Hackathon2022
JonasTYW Jul 27, 2022
5f9f1fa
Update README.md
JonasTYW Jul 27, 2022
a77f753
Update README.md
JonasTYW Jul 27, 2022
04a2232
Update README.md
JonasTYW Jul 27, 2022
Binary file added QML4FRAUD/.DS_Store
Binary file not shown.
711 changes: 711 additions & 0 deletions QML4FRAUD/.ipynb_checkpoints/Architecture-checkpoint.ipynb

Large diffs are not rendered by default.

1,922 changes: 1,922 additions & 0 deletions QML4FRAUD/.ipynb_checkpoints/Experimentation-checkpoint.ipynb

Large diffs are not rendered by default.

1,421 changes: 1,421 additions & 0 deletions QML4FRAUD/.ipynb_checkpoints/Solution via architecture-checkpoint.ipynb

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions QML4FRAUD/Business_application.md
@@ -0,0 +1,23 @@
# Business Application

Machine learning underpins many of the best solutions to industrial problems, and quantum computing has been proposed as a way to enhance it. Using fraud detection as a demonstration, we show that applying LDA as a preprocessing step for a variational classifier can improve the QML model's performance and allow it to handle many more input features. We believe this approach can be improved further.
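
As a concrete illustration, the sketch below runs the submission's own `PrepareData` and `QBC` classes (from `QML4FRAUD/Factory`) end to end. The CSV filename and the `Class` label column are hypothetical placeholders for a fraud dataset.

```python
import pandas as pd

from Factory.data import PrepareData
from Factory.model import QBC

# Hypothetical fraud dataset with a binary "Class" label (0 = legitimate, 1 = fraud)
df = pd.read_csv("creditcard.csv")

# Group the features, reduce each group to one LDA component, and standardize
data = PrepareData(df, target="Class", sample_size=1000)
data.perform_LDA(n_dim=4)  # 4 components -> a 4-qubit classifier

# Train the variational classifier on the reduced features
model = QBC(data, n_dim=4, n_layers=3)
model.train(batch_size=10, n_epochs=50)

# Evaluate on the held-out split
_, _, X_test, Y_test = data.get_preprocessed()
predictions = model.predict(X_test)
```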

Because this is an improvement built on top of existing machine-learning models, it applies directly to real-world cases. In principle, it could eventually be generalized to any multi-class classification problem.

The finance sector consistently ranks among the top three industries for quantum-computing use cases and benefits. Example problems our approach could be applied to include:

- Credit scoring
- Fraud detection
- Default forecasting
- Churn prediction
- Loan approval

and many more.

Better classification also brings many benefits in other industries. A brief, non-exhaustive list includes:

- Energy: energy loss detection.
- E-commerce: recommendation system.
- Several industries: forecasting.
- Healthcare: diagnosis.

Binary file not shown.
Binary file not shown.
187 changes: 187 additions & 0 deletions QML4FRAUD/Factory/data.py
@@ -0,0 +1,187 @@
import copy
import time

import pandas as pd
from pennylane import numpy as np  # PennyLane's NumPy wrapper (supports requires_grad)

import pennylane as qml
#from pennylane_qiskit import IBMQDevice
#from pennylane_qiskit import BasicAerDevice
from pennylane.templates.embeddings import AngleEmbedding, AmplitudeEmbedding
from pennylane.optimize import AdamOptimizer
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize

class PrepareData:
    def __init__(self, data, target, sample_size=0, test_split=0.3, seed=10):
        self.data = data
        self.preprocess_done = None

        if sample_size == 0:
            self.data_sample = data
        else:
            self.data_sample = data.sample(sample_size)

        self.train_set, self.test_set = train_test_split(self.data_sample, test_size=test_split, random_state=seed)

        self.y_train = self.train_set[[target]]
        self.y_test = self.test_set[[target]]

        self.x_train = self.train_set.drop(target, axis=1)
        self.x_test = self.test_set.drop(target, axis=1)

        # Drop all-zero columns, then constant columns (whose correlations are all NaN).
        # Dropping by column name keeps positions valid while columns are removed.
        self.x_train = self.x_train.loc[:, self.x_train.any()]
        corr = self.x_train.corr()
        constant_columns = corr.columns[corr.isnull().all()]
        self.x_train = self.x_train.drop(columns=constant_columns)

        self.x_test = self.x_test[self.x_train.columns]

    def view_info(self):
        print(self.x_train.info())
        if self.preprocess_done is None:
            print("No preprocessing done yet.")
        else:
            print("Preprocessing done via: ", self.preprocess_done)
        return self.data_sample.describe()

    def get_preprocessed(self, to_show=False):
        if self.preprocess_done is None:
            print("Please do some preprocessing first.")
        else:
            if to_show:
                print("Training Set and Labels: ")
                print(self.train_X_preprocessed)
                print(self.train_Y_preprocessed)

                print("Test Set and Labels: ")
                print(self.test_X_preprocessed)
                print(self.test_Y_preprocessed)

            return self.train_X_preprocessed, self.train_Y_preprocessed, self.test_X_preprocessed, self.test_Y_preprocessed

    def perform_LDA(self, n_dim=2):

        self.preprocess_done = "LDA"
        print("Performing LDA...")

        features_train = []
        features_test = []

        # Split the features into n_dim groups of mutually correlated columns.
        # At each step, pick the column whose group_size-th largest absolute
        # correlation is maximal, take its group_size most-correlated columns
        # as one group, and remove them from the pool.
        group_columns = []
        df_clean_train_copy = copy.deepcopy(self.x_train)
        group_size = int(len(df_clean_train_copy.columns) / n_dim)

        for i in range(n_dim):
            corr_train = df_clean_train_copy.corr()

            max_value = 0
            for j in df_clean_train_copy.columns:
                candidate = corr_train[j].abs().sort_values(ascending=False).iloc[group_size - 1]
                if candidate >= max_value:
                    saved = j
                    max_value = candidate

            indices = corr_train[saved].abs().sort_values(ascending=False).index
            df_clean_train_copy = df_clean_train_copy[indices]
            group_columns.append(df_clean_train_copy.columns[:group_size])
            df_clean_train_copy = df_clean_train_copy.iloc[:, group_size:]

        for i in range(n_dim):
            features_train.append(self.x_train[group_columns[i]])
            features_test.append(self.x_test[group_columns[i]])

        # Reduce each feature group to a single LDA component
        features_lda_train = []
        features_lda_test = []
        LDA_transformations = []

        for i in range(n_dim):
            lda = LDA(n_components=1)
            features_lda_train_new = lda.fit_transform(features_train[i], self.y_train.values.ravel())
            features_lda_train.append(pd.DataFrame(features_lda_train_new))
            LDA_transformations.append(lda)

            features_lda_test_new = lda.transform(features_test[i])
            features_lda_test.append(pd.DataFrame(features_lda_test_new))

        x_train_data = features_lda_train[0]
        x_test_data = features_lda_test[0]
        self.transformations = LDA_transformations

        # Join the per-group components into one DataFrame
        for i in range(1, n_dim):
            l_suffix = "_" + str(i)
            r_suffix = "_" + str(i + 1)
            x_train_data = x_train_data.join(features_lda_train[i], lsuffix=l_suffix, rsuffix=r_suffix)
            x_test_data = x_test_data.join(features_lda_test[i], lsuffix=l_suffix, rsuffix=r_suffix)

        # Standardize with statistics fitted on the training set only
        std_scale_train = StandardScaler().fit(x_train_data)
        x_train_data = std_scale_train.transform(x_train_data)
        x_test_data = std_scale_train.transform(x_test_data)

        # Shift labels from {0, 1} to {-1, 1}
        self.train_X_preprocessed = np.array(x_train_data, requires_grad=False)
        self.train_Y_preprocessed = np.array(self.y_train.values[:, 0] * 2 - np.ones(len(self.y_train.values[:, 0])), requires_grad=False)

        self.test_X_preprocessed = np.array(x_test_data, requires_grad=False)
        self.test_Y_preprocessed = np.array(self.y_test.values[:, 0] * 2 - np.ones(len(self.y_test.values[:, 0])), requires_grad=False)

    def perform_PCA(self, n_dim=2):

        self.preprocess_done = "PCA"
        print("Performing PCA...")

        pca = PCA(n_components=n_dim, svd_solver='full')
        pca.fit(self.x_train)
        x_train_pca = pca.transform(self.x_train)
        x_test_pca = pca.transform(self.x_test)
        self.transformations = pca

        train_X_preprocessed = normalize(x_train_pca)
        test_X_preprocessed = normalize(x_test_pca)

        # Shift labels from {0, 1} to {-1, 1}
        self.train_X_preprocessed = np.array(train_X_preprocessed, requires_grad=False)
        self.train_Y_preprocessed = np.array(self.y_train.values[:, 0] * 2 - np.ones(len(self.y_train.values[:, 0])), requires_grad=False)

        self.test_X_preprocessed = np.array(test_X_preprocessed, requires_grad=False)
        self.test_Y_preprocessed = np.array(self.y_test.values[:, 0] * 2 - np.ones(len(self.y_test.values[:, 0])), requires_grad=False)

    def perform_normalize(self, n_dim=2):
        # Minimal completion (an assumption): rescale the raw features to [0, 1]
        # and set the preprocessed attributes, matching the LDA/PCA paths.
        # n_dim is kept for interface consistency; no dimensionality reduction here.
        self.preprocess_done = "Normalize"
        print("Performing Normalize...")

        scaler = MinMaxScaler().fit(self.x_train)
        self.transformations = scaler

        # Shift labels from {0, 1} to {-1, 1}
        self.train_X_preprocessed = np.array(scaler.transform(self.x_train), requires_grad=False)
        self.train_Y_preprocessed = np.array(self.y_train.values[:, 0] * 2 - np.ones(len(self.y_train.values[:, 0])), requires_grad=False)

        self.test_X_preprocessed = np.array(scaler.transform(self.x_test), requires_grad=False)
        self.test_Y_preprocessed = np.array(self.y_test.values[:, 0] * 2 - np.ones(len(self.y_test.values[:, 0])), requires_grad=False)
119 changes: 119 additions & 0 deletions QML4FRAUD/Factory/model.py
@@ -0,0 +1,119 @@
import time

import pandas as pd
from pennylane import numpy as np  # PennyLane's NumPy wrapper (supports requires_grad)

import pennylane as qml
#from pennylane_qiskit import IBMQDevice
#from pennylane_qiskit import BasicAerDevice
from pennylane.templates.embeddings import AngleEmbedding, AmplitudeEmbedding
from pennylane.optimize import AdamOptimizer
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize

class QBC:
    def __init__(self, data, n_dim, n_layers,
                 optimizer=AdamOptimizer(stepsize=0.1, beta1=0.9, beta2=0.99, eps=1e-08),
                 interface_type="autograd",
                 loss_function=None, backend="default.qubit", shots=None):

        if loss_function is None:
            # Default: mean squared error between labels and predictions
            def square_loss(labels, predictions):
                loss = 0
                for l, p in zip(labels, predictions):
                    loss = loss + (l - p) ** 2

                loss = loss / len(labels)
                return loss
            self.loss_function = square_loss
        else:
            self.loss_function = loss_function
        self.opt = optimizer
        self.data = data

        self.n_dim = n_dim

        dev = qml.device(backend, wires=self.n_dim, shots=shots)
        #dev = qml.device('default.qubit.tf', wires=self.n_dim, shots=1024)
        #dev = qml.device('qiskit.ibmq', wires=self.n_dim, backend='ibmq_manila', ibmqx_token="<IBMQ_TOKEN>", shots=256)
        #dev = qml.device('qiskit.basicaer', wires=self.n_dim, shots=256)

        # One qubit per feature: Hadamards, angle embedding, then entangling layers
        def circuit(parameters, data):
            for i in range(n_dim):
                qml.Hadamard(wires=i)

            AngleEmbedding(features=data, wires=range(self.n_dim), rotation='Y')

            qml.StronglyEntanglingLayers(weights=parameters, wires=range(self.n_dim))

            return qml.expval(qml.PauliZ(0))

        # Build the QNode from the raw circuit function (wrapping it only once)
        self.qlayer = qml.QNode(circuit, dev, interface=interface_type, diff_method='best')

        self.n_layers = n_layers
        self.weights = 0.01 * np.random.randn(self.n_layers, self.n_dim, 3, requires_grad=True)
        self.bias = np.array(0.0, requires_grad=True)

    def variational_classifier(self, weights, bias, x):
        return self.qlayer(weights, x) + bias

    def train(self, batch_size=10, n_epochs=50):
        wbest = self.weights
        bbest = self.bias
        abest = 0
        X, Y, _, _ = self.data.get_preprocessed()

        def cost(weights, bias, X, Y):
            predictions = [self.variational_classifier(weights, bias, x) for x in X]
            return self.loss_function(Y, predictions)

        def accuracy(labels, predictions):
            # Fraction of predictions matching the {-1, 1} labels
            correct = 0
            for l, p in zip(labels, predictions):
                if abs(l - p) < 1e-5:
                    correct = correct + 1
            return correct / len(labels)

        for it in range(n_epochs):

            # Update the weights by one optimizer step on a random mini-batch
            batch_index = np.random.randint(0, len(X), (batch_size,))
            X_batch = X[batch_index]
            Y_batch = Y[batch_index]
            self.weights, self.bias, _, _ = self.opt.step(cost, self.weights, self.bias, X_batch, Y_batch)

            # Compute the accuracy on the full training set
            predictions = [np.sign(self.variational_classifier(self.weights, self.bias, x)) for x in X]
            acc = accuracy(Y, predictions)

            if acc > abest:
                wbest = self.weights
                bbest = self.bias
                abest = acc
                print('New best')

            print(
                "Iter: {:5d} | Cost: {:0.7f} | Accuracy: {:0.7f} ".format(
                    it + 1, cost(self.weights, self.bias, X, Y), acc
                )
            )

        # Keep the best parameters seen during training
        self.weights = wbest
        self.bias = bbest

    def predict(self, test_data):
        predictions = [np.sign(self.variational_classifier(self.weights, self.bias, x)) for x in test_data]
        return predictions
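
For completeness, here is a minimal sketch of configuring `QBC` beyond its defaults, using only the constructor parameters defined above. The hinge-style loss and the shot count are illustrative choices, not part of the submission; `data` is assumed to be a `PrepareData` instance with preprocessing already performed.

```python
from pennylane.optimize import AdamOptimizer

from Factory.model import QBC

def hinge_loss(labels, predictions):
    # Illustrative alternative to the default square loss
    loss = 0
    for l, p in zip(labels, predictions):
        loss = loss + max(0, 1 - l * p)
    return loss / len(labels)

model = QBC(
    data,                       # a PrepareData instance, preprocessing already done
    n_dim=4,                    # one qubit per LDA/PCA component
    n_layers=3,
    optimizer=AdamOptimizer(stepsize=0.05),
    loss_function=hinge_loss,
    backend="default.qubit",    # any installed PennyLane device name works here
    shots=1024,                 # finite-shot sampling instead of analytic expectations
)
model.train(batch_size=10, n_epochs=30)
```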