documentation.Rmd

Clearing the working environment and loading the required packages.
```{r Preparation of the environment, warning=FALSE}
# Emit the form-feed character; RStudio interprets it as "clear the console".
cat("\014")
# Remove every object from the current environment so the analysis below
# starts from a clean state.
rm(list = ls())
# Packages used by the chunks below: ggplot2 draws the histograms and kohonen
# provides the SOM functions (somgrid, supersom, predict, plot).
# NOTE(review): no dplyr function is called explicitly in this document.
library(dplyr)
library(ggplot2)
library(kohonen)

```

Data preprocessing (type conversion, imputation of missing values, normalization), followed by saving the result to an .RData file.
```{r Loading the data + data preprocessing, warning=FALSE, echo=FALSE, message=FALSE, results='hide'}

# ---- Load the raw data ----
# The code below relies on five feature columns (BI.RADS, Age, Shape, Margin,
# Density) followed by the class label Severity (1 = malignant, 0 = benign).
dataset <- read.csv("./data/dataset.csv",
                    header = TRUE,
                    sep = ",",
                    stringsAsFactors = FALSE)

featureCols <- c("BI.RADS", "Age", "Shape", "Margin", "Density")

# conversion to numeric type (entries that are not numbers become NA)
for (col in featureCols) {
  dataset[[col]] <- as.numeric(dataset[[col]])
}

# ---- Predicting missing values with linear models ----
# The order matters: each model uses the features that were already completed
# by the previous steps as predictors.
imputationFormulas <- list(
  BI.RADS ~ Severity,
  Age ~ Severity * (BI.RADS),
  Shape ~ Severity * (Age + BI.RADS),
  Margin ~ Severity * (Age + Shape + BI.RADS),
  Density ~ Severity * (Age + Shape + Margin + BI.RADS)
)
for (imputationFormula in imputationFormulas) {
  target <- all.vars(imputationFormula)[1]
  isMissing <- is.na(dataset[[target]])
  if (!any(isMissing)) {
    next # nothing to impute for this feature
  }
  model <- lm(imputationFormula,
              data = dataset[!isMissing, ],
              na.action = na.omit)
  # The features are integer-coded, so predictions are rounded.
  dataset[isMissing, target] <- round(predict(model, dataset[isMissing, ]))
}

# ---- Division of the data set into malignant and benign lesions ----
# Only rows whose coded features lie inside their allowed ranges are kept.
# which() additionally drops rows where a comparison is NA; indexing with an
# NA logical would otherwise insert all-NA rows into the result.
inValidRange <- dataset$BI.RADS >= 1 & dataset$BI.RADS <= 5 &
  dataset$Shape >= 1 & dataset$Shape <= 4 &
  dataset$Margin >= 1 & dataset$Margin <= 5 &
  dataset$Density >= 1 & dataset$Density <= 4

Malignant <- dataset[which(dataset$Severity == 1 & inValidRange), ]
Benign <- dataset[which(dataset$Severity == 0 & inValidRange), ]

numberOfMalignant <- nrow(Malignant)
numberOfBenign <- nrow(Benign)

# ---- Division into two different sets: test and training data ----
# The first half of each class goes to the training set, the rest to the test
# set. A logical mask is used instead of 1:k ranges, which misbehave when a
# class has fewer than two rows.
dataset <- rbind(Malignant, Benign)
malignantInTraining <- seq_len(numberOfMalignant) <= round(numberOfMalignant / 2)
benignInTraining <- seq_len(numberOfBenign) <= round(numberOfBenign / 2)
trainingData <- rbind(Malignant[malignantInTraining, ], Benign[benignInTraining, ])
testingData <- rbind(Malignant[!malignantInTraining, ], Benign[!benignInTraining, ])

# Reset the row names to 1..n (assigning NULL restores the automatic names).
rownames(testingData) <- NULL
rownames(trainingData) <- NULL

# ---- Data normalization (in the range from 0 to 1) ----
# Both subsets are scaled with the same per-feature range, taken from the
# filtered `dataset`. This is the range the user-input chunk uses as well, so
# one raw value always maps to one normalized value.
featureMin <- vapply(dataset[featureCols], min, numeric(1))
featureMax <- vapply(dataset[featureCols], max, numeric(1))
for (col in featureCols) {
  featureSpan <- featureMax[[col]] - featureMin[[col]]
  trainingData[[col]] <- (trainingData[[col]] - featureMin[[col]]) / featureSpan
  testingData[[col]] <- (testingData[[col]] - featureMin[[col]]) / featureSpan
}

# saving the data for faster loading and optimal application operation
save(dataset, testingData, trainingData, Malignant, Benign,
     file = "./data/processed_dataset.RData")

```

Histograms of each feature in the dataset, drawn separately for the malignant and benign decision classes.
```{r Histograms - distribution of features in both classes}

# Builds the overlaid histogram of one column for both diagnosed classes.
# `feature` is the name of a column present in `Malignant` and in `Benign`.
classHistogram <- function(feature) {
  classColours <- c(Malignant = "red", Benign = "blue")
  malignantLayer <- geom_histogram(bins = 4, alpha = 0.5)
  benignLayer <- geom_histogram(
    data = Benign,
    mapping = aes(.data[[feature]], fill = "Benign"),
    bins = 4,
    alpha = 0.5
  )

  ggplot(Malignant, aes(.data[[feature]], fill = "Malignant")) +
    malignantLayer +
    benignLayer +
    labs(x = feature) + # keep the plain column name as the axis label
    ggtitle(paste("Histogram of", feature, "values")) +
    theme(plot.title = element_text(hjust = 0.5)) +
    scale_fill_manual(name = "diagnosed class", values = classColours)
}

# One top-level call per column so that every plot is printed on its own.
classHistogram("BI.RADS")

classHistogram("Age")

classHistogram("Shape")

classHistogram("Margin")

classHistogram("Density")

classHistogram("Severity")

```

Setting the basic parameters of the SOM network in order to test its operation. The following code is the engine of the Shiny application and shows how the network is created and how the trained model is then used by the predict function.
```{r Parameter setting for SOM}

# Training data as a list of layers, one per kind of information.
# The class label is passed as a factor: supersom() then treats the layer as
# categorical and predict() returns class labels. A numeric matrix would be
# handled as a continuous layer, and the predictions would be codebook values
# instead of 0/1 classes.
trainingdata <- list(severity = factor(trainingData[, 6], levels = c(0, 1)),
                     measurements = as.matrix(trainingData[, 1:5]))

# set the appropriate SOM parameters
gridNumOfRow <- 5
gridNumOfCol <- 5
numberOfIterations <- 1000
learningRate <- c(0.05, 0.001) # start and end value of the learning rate
topology <- c("rectangular", "hexagonal")
data_train_matrix <- as.matrix(trainingData)

# xdim is the horizontal extent of the grid (number of columns) and ydim the
# vertical one (number of rows).
som_grid <- somgrid(xdim = gridNumOfCol,
                    ydim = gridNumOfRow,
                    topo = topology[2],
                    neighbourhood.fct = "gaussian")

# keep.data = TRUE stores the training data in the model object.
som_model <- supersom(trainingdata,
                      grid = som_grid,
                      rlen = numberOfIterations,
                      alpha = learningRate,
                      keep.data = TRUE)

```

Plots of the trained network that show the quality of its classification and, above all, the distribution of activated neurons. The charts are intended to show how quickly the network learns and, ultimately, how changing the parameters in the Shiny application affects the quality of its classification.
```{r SOM Charts}

# Training progress
plot(som_model, type = "changes", main = "Training Progress")

# Node counts (the plot type is spelled out in full instead of relying on
# partial matching of "count")
plot(som_model, type = "counts", main = "The node counts")

# Node maps
plot(som_model, type = "mapping", main = "Mapping of the activated nodes")

# Node quality
plot(som_model, type = "quality", main = "The quality of the mapping")

# Neighbor distance
plot(som_model, type = "dist.neighbours", main = "SOM neighbour distances")

# Node codes
plot(som_model, type = "codes", main = "Codes / Weight vectors view")

# setting the coloring function for cluster plot
coolBlueHotRed <- function(n, alpha = 1) {
  rainbow(n, end = 4 / 6, alpha = alpha)[n:1]
}
pretty_palette <- c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
                    "#9467bd", "#8c564b", "#e377c2")

# Hierarchical clustering of the codebook vectors of the first layer of the
# model (the `severity` layer, given the order of the training list), cut into
# two clusters.
som_cluster <- cutree(hclust(dist(som_model$codes[[1]])), 2)

# chosen clusters
plot(som_model,
     type = "mapping",
     bgcol = pretty_palette[som_cluster],
     main = "Classification - division into clusters")
add.cluster.boundaries(som_model, som_cluster)

```

Performing simple tests using the prediction function. The results are presented as sensitivity, specificity and a truth table. The sensitivity and specificity are printed to the console as percentages.
```{r SOM network test - on unknown data (testing data) + Sensitivity + Specificity + Truth table}

# predict results with testing data set
testingdata <- list(measurements = as.matrix(testingData[, 1:5]))
som.prediction <- predict(som_model, newdata = testingdata)
predictions <- som.prediction$predictions[["severity"]] #  <---- results for test set

# Turn the raw predictions into class labels (0 = benign, 1 = malignant).
# A factor layer already holds the labels; a numeric layer holds continuous
# values, which are assigned to the nearer class.
predictedClass <- if (is.factor(predictions)) {
  as.integer(as.character(predictions))
} else {
  as.integer(as.numeric(predictions) >= 0.5)
}

# Truth table: rows = actual class, columns = predicted class. Explicit levels
# keep the table 2 x 2 even when one class is never predicted.
classLevels <- c(0, 1)
resultsTable <- table(actual = factor(testingData[, 6], levels = classLevels),
                      predicted = factor(predictedClass, levels = classLevels))

score <- sum(diag(resultsTable)) # <---- number of correctly classified cases (trace of the table)
truthTable <- resultsTable # <---- Truth table
cat("Tablica prawdy:")
print(truthTable)

# count sensitivity and specificity from truth table
# sensitivity = TP / (TP + FN): share of actual malignant cases that were detected
# specificity = TN / (TN + FP): share of actual benign cases that were recognised
sensitivity <- truthTable["1", "1"] / sum(truthTable["1", ])
specificity <- truthTable["0", "0"] / sum(truthTable["0", ])

# print sensitivity and specificity
cat(paste0("\nSensitivity: ", signif(sensitivity * 100, digits = 4), "%"))
cat(paste0("\nSpecificity: ", signif(specificity * 100, digits = 4), "%"))

```

Performing a simple test using the prediction function, this time for user-supplied data. The data are passed to the predict function together with the previously trained model, and the diagnosis proposed by the SOM network is printed.
```{r Test results for user input}
# insert allowed values from the acceptable range for the given classes (BI.RADS,Age,Shape,Margin,Density)
userdata <- c(4, 28, 1, 1, 3)
userdatamatrix <- matrix(userdata, nrow = 1, ncol = 5, byrow = TRUE)

# data must be normalized before prediction: every feature is rescaled with
# the minimum and maximum of the corresponding column of `dataset`
datasetMin <- vapply(dataset[, 1:5], min, numeric(1))
datasetMax <- vapply(dataset[, 1:5], max, numeric(1))
userdatamatrix[1, ] <- (userdatamatrix[1, ] - datasetMin) / (datasetMax - datasetMin)

customUserdata <- list(measurements = userdatamatrix)
som.predictionUser <- predict(som_model, newdata = customUserdata)

# Decide the class of the single prediction. A factor prediction already is a
# class label; a numeric prediction is a continuous value that would almost
# never be exactly 1, so it is assigned to the nearer class instead.
userPrediction <- som.predictionUser$predictions[["severity"]][1]
isMalignant <- if (is.factor(userPrediction)) {
  as.character(userPrediction) == "1"
} else {
  as.numeric(userPrediction) >= 0.5
}

# print predictions
if (isMalignant) {
  cat('The detected lesion is malignant\n')
} else {
  cat('The detected lesion is benign\n')
}

```