-
Notifications
You must be signed in to change notification settings - Fork 1
/
documentation.Rmd
217 lines (172 loc) · 9.74 KB
/
documentation.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
Clearing the working environment and loading the required packages.
```{r Preparation of the environment, warning=FALSE}
# Emit a form-feed character; RStudio interprets it as "clear the console".
cat("\014")
# Remove every (non-hidden) object from the current environment so the
# document starts from a clean state.
# NOTE(review): when the chunk is run interactively this also wipes the
# user's workspace -- confirm that this is intended.
rm(list = ls())
# Packages attached for the rest of the document: plotting (ggplot2) and
# self-organising maps (kohonen).
# NOTE(review): no dplyr function appears to be called in the visible chunks.
library(dplyr)
library(ggplot2)
library(kohonen)
```
Data preprocessing (type conversion, imputation of missing values, normalization) and subsequent saving to an .RData file.
```{r Loading the data + data preprocessing, warning=FALSE, echo=FALSE, message=FALSE, results='hide'}
# Load the raw data set.
dataset <- read.csv("./data/dataset.csv",
                    header = TRUE,
                    sep = ",",
                    stringsAsFactors = FALSE)

# Columns 1-5 hold the features (BI.RADS, Age, Shape, Margin, Density);
# column 6 (Severity) is the class label.
feature_cols <- 1:5

# conversion to numeric type (entries that cannot be parsed become NA)
for (i in feature_cols) {
  dataset[, i] <- as.numeric(dataset[, i])
}

# predicting missing values with linear models
# The features are imputed one after another and every model only uses
# Severity plus the features that were imputed before it, so the predictors
# of each model are already complete when it is fitted.
imputation_models <- list(
  BI.RADS = BI.RADS ~ Severity,
  Age     = Age ~ Severity * (BI.RADS),
  Shape   = Shape ~ Severity * (Age + BI.RADS),
  Margin  = Margin ~ Severity * (Age + Shape + BI.RADS),
  Density = Density ~ Severity * (Age + Shape + Margin + BI.RADS)
)
for (feature in names(imputation_models)) {
  is_missing <- is.na(dataset[[feature]])
  if (!any(is_missing)) {
    next
  }
  model <- lm(imputation_models[[feature]],
              data = dataset[!is_missing, ],
              na.action = na.omit)
  # the features are integer coded, hence the rounding of the predictions
  dataset[is_missing, feature] <- round(predict(model, dataset[is_missing, ]))
}

# TRUE for the rows whose coded features all lie inside their valid ranges
# (BI.RADS 1-5, Shape 1-4, Margin 1-5, Density 1-4).
has_valid_codes <- function(d) {
  d$BI.RADS >= 1 & d$BI.RADS <= 5 &
    d$Shape >= 1 & d$Shape <= 4 &
    d$Margin >= 1 & d$Margin <= 5 &
    d$Density >= 1 & d$Density <= 4
}

# division of the data set into malignant and benign lesions,
# dropping the rows with out-of-range codes
Malignant <- dataset[dataset$Severity == 1, ]
Benign <- dataset[dataset$Severity == 0, ]
Benign <- Benign[has_valid_codes(Benign), ]
Malignant <- Malignant[has_valid_codes(Malignant), ]
numberOfMalignant <- nrow(Malignant)
numberOfBenign <- nrow(Benign)

# division into two different sets: test and training data
# The first half of each class goes to the training set, the rest to the
# test set. A logical mask is used instead of 1:n so that a class with
# fewer than two rows cannot produce the descending sequence 1:0.
dataset <- rbind(Malignant, Benign)
in_first_half <- function(d) {
  seq_len(nrow(d)) <= round(nrow(d) / 2)
}
malignant_train <- in_first_half(Malignant)
benign_train <- in_first_half(Benign)
trainingData <- rbind(Malignant[malignant_train, ], Benign[benign_train, ])
testingData <- rbind(Malignant[!malignant_train, ], Benign[!benign_train, ])

# reset the row names to consecutive numbers
# (assigning NULL restores the automatic 1..n row names)
rownames(testingData) <- NULL
rownames(trainingData) <- NULL

# data normalization (min-max scaling)
# Both subsets are scaled with the ranges of the complete data set, the same
# ranges that are used later on to normalise user input. Scaling every
# subset with its own minimum / maximum would map the same raw value to
# different normalised values.
for (i in feature_cols) {
  lower <- min(dataset[, i])
  span <- max(dataset[, i]) - lower
  trainingData[, i] <- (trainingData[, i] - lower) / span
  testingData[, i] <- (testingData[, i] - lower) / span
}

# saving the data for faster loading and optimal application operation
save(dataset, testingData, trainingData, Malignant, Benign,
     file = "./data/processed_dataset.RData")
```
Histograms of every feature in the data set, drawn separately for the malignant and benign decision classes.
```{r Histograms - distribution of features in both classes}
# Builds the overlaid histogram of one column for both diagnosis classes:
# malignant cases in red, benign cases in blue, four bins each.
plot_class_histogram <- function(feature) {
  ggplot(Malignant, aes(.data[[feature]], fill = "Malignant")) +
    geom_histogram(bins = 4, alpha = 0.5) +
    geom_histogram(data = Benign,
                   mapping = aes(.data[[feature]], fill = "Benign"),
                   bins = 4, alpha = 0.5) +
    # label the axis with the plain column name
    xlab(feature) +
    ggtitle(paste("Histogram of", feature, "values")) +
    theme(plot.title = element_text(hjust = 0.5)) +
    scale_fill_manual(name = "diagnosed class",
                      values = c(Malignant = "red", Benign = "blue"))
}

# One plot per column; inside a loop the plots have to be printed explicitly.
for (feature in c("BI.RADS", "Age", "Shape", "Margin", "Density", "Severity")) {
  print(plot_class_histogram(feature))
}
```
Setting the basic parameters of the SOM network in order to test its operation. The following code is the engine of the Shiny application: it shows how the network is created and how the resulting model is then used by the predict function.
```{r Parameter setting for SOM}
# create data that will be marked separately
# The class label is passed as a factor so that kohonen treats the layer as
# categorical and predict() returns class labels. A numeric 0/1 matrix would
# be handled as a continuous layer and predict() would return codebook
# values instead of classes.
trainingdata <- list(severity = factor(trainingData[, 6]),
                     measurements = as.matrix(trainingData[, 1:5]))

# set the appropriate SOM parameters
gridNumOfRow <- 5
gridNumOfCol <- 5
numberOfIterations <- 1000
# learning rate at the start and at the end of the training
learningRate <- c(0.05, 0.001)
topology <- c("rectangular", "hexagonal")
# NOTE(review): data_train_matrix is not used by any chunk of this document.
data_train_matrix <- as.matrix(trainingData)

# hexagonal grid (topology[2]) with a gaussian neighbourhood function
som_grid <- somgrid(xdim = gridNumOfRow,
                    ydim = gridNumOfCol,
                    topo = topology[2],
                    neighbourhood.fct = "gaussian")

# train the supervised SOM on both layers (class label + measurements)
som_model <- supersom(trainingdata,
                      grid = som_grid,
                      rlen = numberOfIterations,
                      alpha = learningRate,
                      keep.data = TRUE)
```
Creating plots of the network's classification results, showing its correctness and, above all, the distribution of the activated neurons. The charts are intended to show how quickly the network learns and, ultimately, how changing the parameters in the Shiny application affects the quality of its classification.
```{r SOM Charts}
# Training progress
plot(som_model, type = "changes", main = "Training Progress")
# Node counts
plot(som_model, type = "count", main = "The node counts")
# Node maps
plot(som_model, type = "mapping", main = "Mapping of the activated nodes")
# Node quality
plot(som_model, type = "quality", main = "The quality of the mapping")
# Neighbor distance
plot(som_model, type = "dist.neighbours", main = "SOM neighbour distances")
# Node codes
plot(som_model, type = "codes", main = "Codes / Weight vectors view")

# setting the coloring function for cluster plot
# (n rainbow colours, reversed so that they run from blue to red)
coolBlueHotRed <- function(n, alpha = 1) {
  rainbow(n, end = 4 / 6, alpha = alpha)[n:1]
}
pretty_palette <- c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
                    "#9467bd", "#8c564b", "#e377c2")

# Hierarchical clustering of the codebook vectors of the first layer,
# cut into two clusters.
# NOTE(review): the first layer is the severity layer given the order of the
# training list -- confirm that clustering on it is intended.
som_cluster <- cutree(hclust(dist(som_model$codes[[1]])), 2)

# chosen clusters
plot(som_model,
     type = "mapping",
     bgcol = pretty_palette[som_cluster],
     main = "Classification - division into clusters")
add.cluster.boundaries(som_model, som_cluster)
```
Performing simple tests using the prediction function. The results are presented as sensitivity, specificity and a truth table. The sensitivity and specificity are printed to the console as percentages.
```{r SOM network test - on unknown data (testing data) + Sensitivity + Specificity + Truth table}
# predict results with testing data set
testingdata <- list(measurements = as.matrix(testingData[, 1:5]))
som.prediction <- predict(som_model, newdata = testingdata)
predictions <- som.prediction$predictions[["severity"]] # <---- results for test set

# Reduce the predictions to 0/1 class labels. A factor layer already yields
# labels; a numeric layer yields continuous values, which are rounded to the
# nearest label.
if (is.factor(predictions)) {
  predictedClass <- as.numeric(as.character(predictions))
} else {
  predictedClass <- round(as.vector(predictions))
}

# Fixing the levels guarantees a 2 x 2 table even when one of the classes is
# never predicted. Rows are the actual classes, columns the predicted ones.
classLevels <- c(0, 1)
resultsTable <- table(actual = factor(testingData[, 6], levels = classLevels),
                      predicted = factor(predictedClass, levels = classLevels))
score <- sum(diag(resultsTable)) # <---- number of correctly classified cases
truthTable <- resultsTable # <---- Truth table
cat("Tablica prawdy:")
print(truthTable)

# count sensitivity and specificity from truth table
# sensitivity = TP / (TP + FN): share of the actual malignant cases (row 2)
# that were predicted as malignant
sensitivity <- truthTable[2, 2] / sum(truthTable[2, ])
# specificity = TN / (TN + FP): share of the actual benign cases (row 1)
# that were predicted as benign
specificity <- truthTable[1, 1] / sum(truthTable[1, ])

# print sensitivity and specificity
cat(paste0("\nSensitivity: ", signif(sensitivity*100,digits=4),"%"))
cat(paste0("\nSpecificity: ", signif(specificity*100,digits=4),"%"))
```
Performing simple tests using the prediction function, this time for user-supplied data. After the analysis with the predict function, the diagnosis proposed by the previously trained SOM model is printed.
```{r Test results for user input}
# insert allowed values from the acceptable range for the given classes (BI.RADS,Age,Shape,Margin,Density)
userdata <- c(4, 28, 1, 1, 3)
# exactly one value per feature is required
stopifnot(is.numeric(userdata), length(userdata) == 5)
userdatamatrix <- matrix(userdata, nrow = 1, ncol = 5, byrow = TRUE)

# data must be normalized before prediction
# (min-max scaling with the ranges of the complete data set)
for (i in seq_len(ncol(userdatamatrix))) {
  lower <- min(dataset[, i])
  span <- max(dataset[, i]) - lower
  userdatamatrix[1, i] <- (userdatamatrix[1, i] - lower) / span
}
customUserdata <- list(measurements = userdatamatrix)
som.predictionUser <- predict(som_model, newdata = customUserdata)

# Reduce the prediction to a 0/1 class label before testing it: a factor
# layer yields the label itself, a numeric layer a continuous value that
# would practically never be exactly equal to 1.
predictedSeverity <- som.predictionUser$predictions[["severity"]][1]
if (is.factor(predictedSeverity)) {
  predictedClass <- as.numeric(as.character(predictedSeverity))
} else {
  predictedClass <- round(as.vector(predictedSeverity))
}

# print predictions
if (predictedClass == 1) {
  cat('The detected lesion is malignant\n')
} else {
  cat('The detected lesion is benign\n')
}
```