diff --git a/botiverse/models/LSTM/LSTM.ipynb b/botiverse/models/LSTM/LSTM.ipynb
index cab18f9..9fbcd11 100644
--- a/botiverse/models/LSTM/LSTM.ipynb
+++ b/botiverse/models/LSTM/LSTM.ipynb
@@ -39,7 +39,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -49,6 +49,7 @@
     "from torch.autograd import Variable\n",
     "import numpy as np\n",
     "from tqdm import tqdm\n",
+    "import os\n",
     "\n",
     "class LSTMCell(nn.Module): \n",
     "    '''\n",
@@ -111,7 +112,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -178,7 +179,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -194,34 +195,96 @@
     "        self.criterion = nn.CrossEntropyLoss()\n",
     "\n",
     "    def forward(self, x):\n",
+    "        '''\n",
+    "        Forward pass of the LSTMClassifier: the input passes through all the LSTM layers and a final linear layer to produce the output.\n",
+    "        :param x: The input to the LSTMClassifier, of shape (batch_size, seq_len, input_size)\n",
+    "        :return: The output of the LSTMClassifier, of shape (batch_size, num_classes)\n",
+    "        '''\n",
     "        out = self.lstm(x)\n",
     "        out = self.fc(out)\n",
     "        return out\n",
     "    \n",
-    "    def fit(self, X, y, hidden_size=64, λ=0.001, num_epochs=100, val_size=0.0):\n",
+    "    \n",
+    "    def fit(self, X, y, λ=0.001, α=1e-3, max_epochs=100, patience=5, val_ratio=0.2):\n",
+    "        '''\n",
+    "        Fit the LSTMClassifier to the given data.\n",
+    "        :param X: The input data of shape (batch_size, seq_len, input_size)\n",
+    "        :param y: The labels of the data, of shape (batch_size)\n",
+    "        :param λ: The learning rate (default: 0.001)\n",
+    "        :param α: The weight decay coefficient for Adam (default: 1e-3)\n",
+    "        :param max_epochs: The maximum number of epochs to train for (default: 100)\n",
+    "        :param patience: The number of epochs without validation improvement before early stopping (default: 5)\n",
+    "        :param val_ratio: The fraction of the data held out for validation (default: 0.2)\n",
+    "        '''\n",
     "        Xt = torch.from_numpy(X)\n",
     "        yt = torch.from_numpy(y)\n",
+    "        if val_ratio:\n",
+    "            # randomly shuffle the data, then split off a validation set\n",
+    "            indices = torch.randperm(len(Xt))\n",
+    "            Xt, yt = Xt[indices], yt[indices]\n",
+    "            val_size = int(val_ratio * len(Xt))\n",
+    "            Xt, Xv = Xt[:-val_size], Xt[-val_size:]\n",
+    "            yt, yv = yt[:-val_size], yt[-val_size:]\n",
     "        \n",
-    "        optimizer = torch.optim.Adam(self.parameters(), lr=λ)\n",
-    "        pbar = tqdm(range(num_epochs))\n",
+    "        \n",
+    "        optimizer = torch.optim.Adam(self.parameters(), lr=λ, weight_decay=α)\n",
+    "        print(\"Training the LSTMClassifier...\")\n",
+    "        # __file__ is undefined inside a notebook, so fall back to the working directory\n",
+    "        curr_dir = os.path.dirname(os.path.realpath(__file__)) if '__file__' in globals() else os.getcwd()\n",
+    "        bad_epochs = 0\n",
+    "        val_accuracy = 0\n",
+    "        val_loss = 0\n",
+    "        best_loss = np.inf\n",
+    "        pbar = tqdm(range(max_epochs))\n",
     "        for epoch in pbar:\n",
     "            outputs = self(Xt)\n",
     "            loss = self.criterion(outputs.squeeze(), yt)\n",
-    "            pbar.set_description(f\"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}\")\n",
+    "            pbar.set_description(f\"Epoch {epoch+1}/{max_epochs}, Loss: {loss.item()}\")\n",
     "            \n",
     "            optimizer.zero_grad()\n",
     "            loss.backward()\n",
     "            optimizer.step()\n",
+    "            if val_ratio:\n",
+    "                # evaluate on the held-out validation set\n",
+    "                val_accuracy = self.evaluate(Xv, yv)\n",
+    "                with torch.no_grad():\n",
+    "                    val_loss = self.criterion(self(Xv).squeeze(), yv)\n",
+    "                if val_loss < best_loss:\n",
+    "                    best_loss = val_loss\n",
+    "                    bad_epochs = 0\n",
+    "                    # save the best model so far\n",
+    "                    torch.save(self.state_dict(), os.path.join(curr_dir, \"LSTMClassifier.pt\"))\n",
+    "                else:\n",
+    "                    bad_epochs += 1\n",
+    "                    if bad_epochs == patience:\n",
patience:\n", + " print(f\"{patience} epochs have passed without improvement. Early stopping...\")\n", + " self.load_state_dict(torch.load(os.path.join(curr_dir, \"LSTMClassifier.pt\")))\n", + " break\n", + " # every 5 epochs see\n", + " pbar.set_postfix({\"Validation Accuracy\": val_accuracy}) \n", + " \n", "\n", " def predict(self, X):\n", + " '''\n", + " Predict the labels of the given data by passing it through the LSTMClassifier.\n", + " :param X: The input data of shape (batch_size, seq_len, input_size)\n", + " :return: The predicted labels of the data of shape (batch_size)\n", + " '''\n", " Xt = torch.from_numpy(X)\n", " outputs = self(Xt)\n", - " outputs = torch.argmax(outputs, dim=1)\n", - " return outputs.detach().numpy()\n", + " pred = torch.argmax(outputs, dim=1)\n", + " softmax = nn.Softmax(dim=1)\n", + " prob = torch.max(softmax(outputs), dim=1)\n", + " return pred.detach().numpy(), prob.values.detach().numpy()\n", " \n", - " def evaluate(self, X, y):\n", - " Xt = torch.from_numpy(X)\n", - " yt = torch.from_numpy(y)\n", + " def evaluate(self, Xt, yt):\n", + " '''\n", + " Evaluate the LSTMClassifier on the given data.\n", + " :param X: The input data of shape (batch_size, seq_len, input_size)\n", + " :param y: The labels of the data of shape (batch_size)\n", + " :return: The accuracy of the LSTMClassifier on the given data\n", + " '''\n", + " # check ig they are torch tensors\n", + " if not isinstance(Xt, torch.Tensor) or not isinstance(yt, torch.Tensor):\n", + " Xt = torch.from_numpy(Xt)\n", + " yt = torch.from_numpy(yt)\n", " outputs = self(Xt)\n", " outputs = torch.argmax(outputs, dim=1)\n", " # compute the accuracy\n", @@ -230,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -238,7 +301,7 @@ "output_type": "stream", "text": [ "[NbConvertApp] Converting notebook LSTM.ipynb to script\n", - "[NbConvertApp] Writing 8093 bytes to LSTM.py\n" + "[NbConvertApp] Writing 11235 bytes to LSTM.py\n" ] } ], diff --git a/botiverse/models/LSTM/LSTM.py b/botiverse/models/LSTM/LSTM.py index ed44bd3..7640081 100644 --- a/botiverse/models/LSTM/LSTM.py +++ b/botiverse/models/LSTM/LSTM.py @@ -27,7 +27,7 @@ # # -# In[2]: +# In[5]: import torch @@ -36,6 +36,7 @@ from torch.autograd import Variable import numpy as np from tqdm import tqdm +import os class LSTMCell(nn.Module): ''' @@ -90,7 +91,7 @@ def forward(self, input, h, c): # # Given an input sequence, each token passes by all the layers and each layer has its own hidden state and cell state which is its output due to the previous token. -# In[3]: +# In[6]: class LSTMX(nn.Module): @@ -154,7 +155,7 @@ def forward(self, input, hₒ=None): return outs[-1] -# In[4]: +# In[7]: class LSTMClassifier(nn.Module): @@ -169,41 +170,103 @@ def __init__(self, input_size, hidden_size, num_classes): self.criterion = nn.CrossEntropyLoss() def forward(self, x): + ''' + Forward pass of the LSTMClassifier which takes the input and passes it through all the LSTM layers and an output layer to produce an output. + :param x: The input to the LSTMClassifier which is of shape (batch_size, seq_len, input_size) + :return: The output of the LSTMClassifier which is of shape (batch_size, num_classes) + ''' out = self.lstm(x) out = self.fc(out) return out - def fit(self, X, y, hidden_size=64, λ=0.001, num_epochs=100, val_size=0.0): + + def fit(self, X, y, λ=0.001, α=1e-3, max_epochs=100, patience=5, val_ratio=0.2): + ''' + Fit the LSTMClassifier to the given data. 
+        :param X: The input data of shape (batch_size, seq_len, input_size)
+        :param y: The labels of the data, of shape (batch_size)
+        :param λ: The learning rate (default: 0.001)
+        :param α: The weight decay coefficient for Adam (default: 1e-3)
+        :param max_epochs: The maximum number of epochs to train for (default: 100)
+        :param patience: The number of epochs without validation improvement before early stopping (default: 5)
+        :param val_ratio: The fraction of the data held out for validation (default: 0.2)
+        '''
         Xt = torch.from_numpy(X)
         yt = torch.from_numpy(y)
+        if val_ratio:
+            # randomly shuffle the data, then split off a validation set
+            indices = torch.randperm(len(Xt))
+            Xt, yt = Xt[indices], yt[indices]
+            val_size = int(val_ratio * len(Xt))
+            Xt, Xv = Xt[:-val_size], Xt[-val_size:]
+            yt, yv = yt[:-val_size], yt[-val_size:]
 
-        optimizer = torch.optim.Adam(self.parameters(), lr=λ)
-        pbar = tqdm(range(num_epochs))
+
+        optimizer = torch.optim.Adam(self.parameters(), lr=λ, weight_decay=α)
+        print("Training the LSTMClassifier...")
+        # __file__ is undefined inside a notebook, so fall back to the working directory
+        curr_dir = os.path.dirname(os.path.realpath(__file__)) if '__file__' in globals() else os.getcwd()
+        bad_epochs = 0
+        val_accuracy = 0
+        val_loss = 0
+        best_loss = np.inf
+        pbar = tqdm(range(max_epochs))
         for epoch in pbar:
             outputs = self(Xt)
             loss = self.criterion(outputs.squeeze(), yt)
-            pbar.set_description(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
+            pbar.set_description(f"Epoch {epoch+1}/{max_epochs}, Loss: {loss.item()}")
 
             optimizer.zero_grad()
             loss.backward()
             optimizer.step()
+            if val_ratio:
+                # evaluate on the held-out validation set
+                val_accuracy = self.evaluate(Xv, yv)
+                with torch.no_grad():
+                    val_loss = self.criterion(self(Xv).squeeze(), yv)
+                if val_loss < best_loss:
+                    best_loss = val_loss
+                    bad_epochs = 0
+                    # save the best model so far
+                    torch.save(self.state_dict(), os.path.join(curr_dir, "LSTMClassifier.pt"))
+                else:
+                    bad_epochs += 1
+                    if bad_epochs == patience:
+                        print(f"{patience} epochs have passed without improvement. Early stopping...")
+                        self.load_state_dict(torch.load(os.path.join(curr_dir, "LSTMClassifier.pt")))
+                        break
+                pbar.set_postfix({"Validation Accuracy": val_accuracy})
+
 
     def predict(self, X):
+        '''
+        Predict the labels of the given data by passing it through the LSTMClassifier.
+        :param X: The input data of shape (batch_size, seq_len, input_size)
+        :return: The predicted labels, of shape (batch_size), together with the probability of each prediction
+        '''
         Xt = torch.from_numpy(X)
         outputs = self(Xt)
-        outputs = torch.argmax(outputs, dim=1)
-        return outputs.detach().numpy()
+        pred = torch.argmax(outputs, dim=1)
+        softmax = nn.Softmax(dim=1)
+        prob = torch.max(softmax(outputs), dim=1)
+        return pred.detach().numpy(), prob.values.detach().numpy()
 
-    def evaluate(self, X, y):
-        Xt = torch.from_numpy(X)
-        yt = torch.from_numpy(y)
+    def evaluate(self, Xt, yt):
+        '''
+        Evaluate the LSTMClassifier on the given data.
+        :param Xt: The input data of shape (batch_size, seq_len, input_size)
+        :param yt: The labels of the data, of shape (batch_size)
+        :return: The accuracy of the LSTMClassifier on the given data
+        '''
+        # convert from numpy if the inputs are not already torch tensors
+        if not isinstance(Xt, torch.Tensor):
+            Xt = torch.from_numpy(Xt)
+        if not isinstance(yt, torch.Tensor):
+            yt = torch.from_numpy(yt)
         outputs = self(Xt)
         outputs = torch.argmax(outputs, dim=1)
         # compute the accuracy
         return (outputs == yt).sum().item() / len(yt)
 

-# In[5]:
+# In[8]:
 

 # if running from notebook
diff --git a/botiverse/models/LSTM/LSTMClassifier.pt b/botiverse/models/LSTM/LSTMClassifier.pt
new file mode 100644
index 0000000..451983d
Binary files /dev/null and b/botiverse/models/LSTM/LSTMClassifier.pt differ
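For reference, a minimal usage sketch of the revised LSTMClassifier API introduced by this diff. It is not part of the patch: the dimensions, the toy random data, and the variable names below are illustrative assumptions, chosen only to match the shapes and dtypes that fit/predict/evaluate expect (float32 inputs, int64 labels).

    import numpy as np

    # illustrative sizes, not taken from the repository
    input_size, hidden_size, num_classes = 16, 64, 3
    model = LSTMClassifier(input_size, hidden_size, num_classes)

    # toy data: 100 sequences of length 12
    X = np.random.randn(100, 12, input_size).astype(np.float32)
    y = np.random.randint(0, num_classes, size=100).astype(np.int64)

    # Adam with lr=λ and weight_decay=α; 20% of the data is held out for
    # validation and training early-stops after `patience` epochs without
    # improvement in validation loss, restoring the best checkpoint
    model.fit(X, y, λ=1e-3, α=1e-3, max_epochs=100, patience=5, val_ratio=0.2)

    # predict now returns both the argmax labels and their softmax probabilities
    labels, probs = model.predict(X)
    accuracy = model.evaluate(X, y)

Note the behavioral change for callers: predict previously returned a single array of labels, so any downstream code must be updated to unpack the (labels, probabilities) pair.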