-
-
Notifications
You must be signed in to change notification settings - Fork 3
/
TestCode.py
423 lines (302 loc) · 16.5 KB
/
TestCode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
# -*- coding: utf-8 -*-
"""diamondValuationEnglish.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1N2ahJEU06ftfN0SpPmxtXd83XkMdI1Rd
"""
# Importing libraries and loading the DataFrames
import math
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
# Location of the raw dataset (with missing values); change it to match
# your environment.
path = r"/content/drive/MyDrive/Diamond/REPO/Database/Diamonds_values_faltantes.csv"
# Load the CSV into the main working DataFrame.
diamonds = pd.read_csv(path)
# Notebook-style display of the loaded frame.
diamonds
# Visualization of linear correlation coefficient and data separation for
# better implementation of KNN.
# Number of missing values per column, keyed by column name.
counter = {
    column_name: int(diamonds[column_name].isna().sum())
    for column_name in diamonds.columns
}
# Two-column summary table: column name and its amount of NaN values.
counter_df = pd.DataFrame(list(counter.items()), columns=['Coluna', 'Quantidade de NaN'])
counter_df
# Heatmap visualization of the linear correlation coefficient between the
# numerical columns.
correlation_columns = ["carat", "depth", "table", "price", "x", "y", "z"]
correlation_matrix = diamonds[correlation_columns].corr()
plt.figure(figsize=(8, 6))
# Correlations range from -1 to 1, so the colour scale is pinned to that span.
sns.heatmap(correlation_matrix, vmin=-1, vmax=1, annot=True, cmap='magma')
plt.title("Coeficiênte de correlação linear")
plt.show()
"""Analysis of the heatmap based on price:
We can conclude that the price doesn't have a good correlation with the total percentage of the diamond (depth) and also doesn't have a high correlation with the table, being an inversely proportional correlation of -0.0086 with the depth, and a proportional relationship of 0.13 with the table.
We can also conclude that the price has a good linear correlation with carat (carat) of 0.92, x (length) of 0.89, y (width) of 0.89 and z (depth) of 0.88.
Based on this heatmap analysis, we can conclude that the larger the carat (carat), x (length), y (width) and z (depth), the higher the price of the diamond may be.
However, there may be some cases where a diamond has a very high carat but a low price, just as there may be diamonds with a low carat but a high price. This can also happen with x (length), y (width) and z (depth). To address this, we need to analyze the Coefficient of Determination.
"""
# Heatmap visualization of the Coefficient of Determination (squared linear
# correlation) between the numerical columns.
plt.figure(figsize=(8, 6))
determination = diamonds[["carat", "depth", "table", "price", "x", "y", "z"]].corr() ** 2
# Squared correlations can never be negative, so the colour scale starts at 0
# instead of -1; otherwise half of the colormap would never be used.
sns.heatmap(determination, vmin=0, vmax=1, annot=True, cmap='magma')
plt.title("Coefficient of Determination")
plt.show()
"""Analysis of the heatmap based on price:
Analyzing the heatmap above, we can see that we can define the price of the diamond with greater reliability using the numerical variable carat (carat), with a reliability of 85%. This means that even though we can say that the larger the carat of the diamond, the higher its price, unfortunately this rule is only actually valid for 85% of the data.
For x (length), y (width) and z (depth), this reliability is only 79% for length and width, and 78% for depth, which is not a strong determination. Therefore, these features can be disregarded if categorical variables can define the price of the diamond with precision.
We will now perform the process of separating the diamonds database to make the machine learning process more effective.
Cut: 5 classifications: Ideal, Premium, Good, Very Good and Fair
Color: 7 classifications: E, I, J, H, F, G and D
Clarity: 8 classifications: SI2, SI1, VS1, VS2, VVS2, VVS1, I1 and IF
"""
# Implementing K-NN
# Measurements stored from column position 7 onward (presumably x, y and z —
# confirm against the CSV layout) are marked as missing (NaN) when they are
# equal to 0 or greater than or equal to 30, so that they get imputed later.
measurement_columns = diamonds.columns[7:]
measurements = diamonds[measurement_columns]
# Vectorised replacement of the former cell-by-cell iloc loop.
invalid_measurements = (measurements == 0) | (measurements >= 30)
diamonds[measurement_columns] = measurements.mask(invalid_measurements)
diamonds
# Implementing K-NN on numerical columns
# Some books advise using the formula K = log n, where n is the number of
# rows in the database, to define the amount of neighbours K.
knn_numeric_columns = ["carat", "depth", "table", "price", "x", "y", "z"]
numeric_neighbors = round(math.log(diamonds.shape[0]))
classificacao = KNNImputer(n_neighbors=numeric_neighbors)
# Fit the imputer on the numerical columns and write the filled values back.
diamonds[knn_numeric_columns] = classificacao.fit_transform(diamonds[knn_numeric_columns])
diamonds
# Applying K-NN for categorical columns
# Every column is ordinal-encoded, the encoded matrix is imputed with K-NN
# (K = log n, as for the numerical columns) and then decoded back.
# NOTE(review): K-NN imputation averages neighbour codes, so imputed codes may
# not be whole numbers before inverse_transform — confirm the decoding is the
# intended one.
categorical_neighbors = round(math.log(diamonds.shape[0]))
encoder = OrdinalEncoder()
knn_imputer = KNNImputer(n_neighbors=categorical_neighbors)
diamonds_encoder = encoder.fit_transform(diamonds)
diamonds_imputer = pd.DataFrame(
    knn_imputer.fit_transform(diamonds_encoder),
    columns=diamonds.columns,
)
diamonds_imputer = encoder.inverse_transform(diamonds_imputer)
# Replacing the missing values in the main diamonds dataset. Only column
# positions 1 to 3 are touched (presumably cut, color and clarity — confirm).
for row_position in range(diamonds.shape[0]):
    for column_position in (1, 2, 3):
        if pd.isna(diamonds.iloc[row_position, column_position]):
            diamonds.iloc[row_position, column_position] = diamonds_imputer[row_position][column_position]
diamonds
# Normalizing numerical columns
## Standardization of the precision of the numerical columns
# Carat and the three measurements keep two decimal places.
two_decimal_columns = ["carat", "x", "y", "z"]
diamonds[two_decimal_columns] = diamonds[two_decimal_columns].round(2)
# Table and price are rounded to whole numbers.
whole_number_columns = ["table", "price"]
diamonds[whole_number_columns] = diamonds[whole_number_columns].round()
# Depth keeps a single decimal place.
diamonds["depth"] = diamonds["depth"].round(1)
diamonds
# Saving the cleaned database without missing values
# NOTE(review): the backslash separator is Windows-style; on POSIX systems it
# becomes part of the file name — confirm the intended destination.
path = r"DataBases\Diamonds_limpa.csv"
# Check for the file directly instead of parsing the whole CSV just to learn
# whether it exists; an existing export is never overwritten.
if os.path.exists(path):
    print(f"This dataframe already exists in the directory: {path}")
else:
    diamonds.to_csv(path, index=False)
    print(f'''Cleaned database added to directory:
{path}
successfully!!''')
"""# Analysis of the price relationship of the numerical columns
IMPORTANT INFORMATION:
1 Carat is equivalent to 200mg
1 Point is equivalent to 0.01 carats
# Comparing the relationship of diamond length with carat and price
"""
# Two stacked scatter plots sharing the length (x) axis: price on top,
# carat below.
plt.figure(figsize=(17, 10))
length_panels = (("price", "Price"), ("carat", "Carat"))
for panel_number, (target_column, target_label) in enumerate(length_panels, start=1):
    plt.subplot(2, 1, panel_number)
    sns.scatterplot(data=diamonds, x="x", y=target_column)
    plt.xlabel("Length (mm)")
    plt.ylabel(target_label)
    # Hide the frame lines so only the horizontal grid guides the eye.
    current_axes = plt.gca()
    for side in ("right", "top", "left"):
        current_axes.spines[side].set_visible(False)
    plt.grid(axis="y", alpha=0.5)
plt.show()
"""# Comparing the relationship of diamond width with carat and price
"""
# Two stacked scatter plots sharing the width (y) axis: price on top,
# carat below.
plt.figure(figsize=(17, 10))
for panel_number, target_column, target_label in ((1, "price", "Price"), (2, "carat", "Carat")):
    width_axes = plt.subplot(2, 1, panel_number)
    sns.scatterplot(diamonds, x="y", y=target_column)
    width_axes.set_xlabel("Width (mm)")
    width_axes.set_ylabel(target_label)
    # Remove the right, top and left frame lines.
    for side in ("right", "top", "left"):
        width_axes.spines[side].set_visible(False)
    width_axes.grid(axis="y", alpha=0.5)
plt.show()
"""# Comparing the relationship of diamond depth with carat and price
"""
# Two stacked scatter plots sharing the depth (z) axis: price on top,
# carat below.
plt.figure(figsize=(17, 10))
depth_targets = {"price": "Price", "carat": "Carat"}
for panel_number, target_column in enumerate(depth_targets, start=1):
    plt.subplot(2, 1, panel_number)
    sns.scatterplot(diamonds, x="z", y=target_column)
    plt.xlabel("Depth (mm)")
    plt.ylabel(depth_targets[target_column])
    # Remove the right, top and left frame lines of the current panel.
    panel_spines = plt.gca().spines
    for side in ("right", "top", "left"):
        panel_spines[side].set_visible(False)
    plt.grid(axis="y", alpha=0.5)
plt.show()
"""# Comparing the relationship of diamond carat with price
"""
# Single scatter plot of price as a function of carat.
plt.figure(figsize=(17, 5))
sns.scatterplot(diamonds, x="carat", y="price")
carat_axes = plt.gca()
carat_axes.set_xlabel("Carat")
carat_axes.set_ylabel("Price")
carat_axes.set_title("Price and Carat Relationship")
# Remove the right, top and left frame lines; keep a light horizontal grid.
for side in ("right", "top", "left"):
    carat_axes.spines[side].set_visible(False)
carat_axes.grid(axis="y", alpha=0.5)
plt.show()
"""Based on the presented graphs, it is evident that the length, width and depth of a diamond have a more reliable relationship with its weight in carats than with its price. Therefore, when determining the value of a diamond with the minimum of necessary measurements, we can rely on the carat data provided. Physical dimensions, such as length, width and depth, provide a more accurate indication of diamond weight than its monetary value.
However, it is important to note that this does not mean that we cannot use length, width and depth measurements to estimate the value of a diamond. On the contrary, the more information we have, the more accurate the estimate of the diamond's price will be. However, if we have to choose the minimum amount of information to estimate the value of a diamond, we can say that the carat is sufficient for this assessment.
# Three ways to estimate the price of the diamond for the program user:
1.Request the mass of the diamond from the customer:
$$ \text{Carat} = \frac{\text{Mass (mg)}}{200}$$
2.When the user provides the diamond points:
$$Carat = \frac{\text{Points (pt)}}{100}$$
3.For the third way to estimate the carat of the diamond, 4 things are needed: Length (mm), Width (mm), Depth (mm) and density ($\frac{mg}{mm³}$). We will use the density calculation of an object to first calculate the mass of the diamond:
$$Density = \frac{Mass}{Volume} \rightarrow Mass = Density \times Volume$$
However, we don't have the volume of the diamond. For that, we will break down the volume calculation of an object, being:
$$Volume = Length \times Width \times Depth$$
Substituting into the formula then, it will be:
$$Mass = Density \times Length \times Width \times Depth$$
Now we will have to find the carat of the diamond. For this, we will use method 1 to convert the mass into carats:
$$Carat = \frac{Mass (mg)}{200}$$
Being in the general formula:
$$Carat = \frac{Density \times Volume}{200}$$ $$\text{or}$$
$$Carat = \frac{Length \times Width \times Depth \times Density}{200} $$
# Price Relationship with Categorical Columns
Descriptive statistics of the dataset
"""
# Summary statistics of the numerical columns (notebook display).
diamonds.describe()
"""# Price Relationship with Categorical Columns - Descriptive statistics of the dataset
"""
description = diamonds.describe()
# Rows 4, 5 and 6 of describe() hold the 25%, 50% and 75% quantiles.
# Column position 3 is assumed to be price and position 0 carat — confirm
# against the column order of the dataset.
quantile_rows = (4, 5, 6)
# Row labels for the price bands.
price = [f"until ${description.iloc[row, 3]}" for row in quantile_rows]
price.append(f"greater than ${description.iloc[6, 3]}")
# Row labels for the carat bands (the "$" prefix is kept here as well).
carat = [f"until ${description.iloc[row, 0]}" for row in quantile_rows]
carat.append(f"greater than ${description.iloc[6, 0]}")
"""# Function to group data by price or carat intervals
"""
def _tabela_proporcoes(faixas, categorias, niveis, rotulos):
    """Return, per level of ``categorias``, the share of rows in each band."""
    contagem = pd.DataFrame(
        {
            nivel: [
                float(((faixas == faixa) & (categorias == nivel)).sum())
                for faixa in range(len(rotulos))
            ]
            for nivel in niveis
        },
        index=rotulos,
    )
    # Column-wise normalisation: each level is divided by its own total.
    # A level with no rows yields NaN (0 / 0) instead of raising.
    return (contagem / contagem.sum(axis=0)).round(4)


def agrupamento(diamonds, coluna, index_coluna: list):
    """Distribute each cut, color and clarity level across quartile bands.

    The rows of ``diamonds`` are split into four bands using the 25%, 50%
    and 75% quantiles (q1, q2, q3) of ``coluna``: ``<= q1``, ``(q1, q2]``,
    ``(q2, q3]`` and ``> q3``. Rows whose ``coluna`` value is missing are
    not counted. For every level of ``cut``, ``color`` and ``clarity`` the
    result holds the fraction of that level's rows found in each band,
    rounded to 4 decimal places.

    Args:
        diamonds: DataFrame containing ``coluna`` and the ``cut``,
            ``color`` and ``clarity`` columns.
        coluna: Name of the numeric column used to build the bands.
        index_coluna: Four row labels, ordered from the lowest band to the
            highest. Labels are applied by position.

    Returns:
        A tuple ``(cut, color, clarity)`` of DataFrames with one row per
        band and one column per level. When ``coluna`` is ``"carat"`` the
        row labels are replaced by ``"until <q>"`` / ``"greater than <q>"``
        labels built from the quantiles, without the currency symbol.

    Raises:
        ValueError: If ``index_coluna`` does not contain exactly four labels.
    """
    rotulos = list(index_coluna)
    if len(rotulos) != 4:
        raise ValueError("index_coluna must contain exactly four labels")

    niveis_cut = ("Fair", "Good", "Very Good", "Premium", "Ideal")
    niveis_color = ("J", "D", "I", "E", "F", "H", "G")
    niveis_clarity = ("I1", "IF", "VVS1", "VVS2", "VS1", "VS2", "SI2", "SI1")

    # Work on a positional index so the masks below always line up,
    # whatever index the caller's frame carries.
    valores = diamonds[coluna].reset_index(drop=True)

    # Quantiles are computed once instead of on every loop iteration.
    q1 = valores.quantile(0.25)
    q2 = valores.quantile(0.50)
    q3 = valores.quantile(0.75)

    # Band number = how many quantiles the value strictly exceeds (0..3).
    faixas = (
        (valores > q1).astype(int)
        + (valores > q2).astype(int)
        + (valores > q3).astype(int)
    )
    # Missing values belong to no band.
    faixas = faixas.where(valores.notna(), -1)

    if coluna == "carat":
        # Carat is a weight, not money: labels carry no "$".
        rotulos = [
            f"until {q1}",
            f"until {q2}",
            f"until {q3}",
            f"greater than {q3}",
        ]

    cut = _tabela_proporcoes(
        faixas, diamonds["cut"].reset_index(drop=True), niveis_cut, rotulos
    )
    color = _tabela_proporcoes(
        faixas, diamonds["color"].reset_index(drop=True), niveis_color, rotulos
    )
    clarity = _tabela_proporcoes(
        faixas, diamonds["clarity"].reset_index(drop=True), niveis_clarity, rotulos
    )
    return cut, color, clarity
"""# Building the grouped tables by price and by carat intervals"""
# Distribution of each cut, color and clarity level across the price bands.
cut, color, clarity = agrupamento(diamonds, "price", price)
# Same three tables, with the bands built from carat instead of price.
cut_carat, color_carat, clarity_carat = agrupamento(diamonds, "carat", carat)
"""# Analyzing the generated tables:
The commands above create three tables (cut, color and clarity) that show, as a fraction of each category's total, the amount of diamonds with certain characteristics within specific price intervals. In addition, three other similar tables are generated, but instead of grouping the data by price, they are grouped by carat weight.
# Although the cut has a more significant relationship with the carat of a diamond, when we use this feature to determine the value of the jewel, we can more accurately estimate a price range for the diamond registered on the website.
"""
# Notebook-style display of the generated tables: color and cut grouped by
# price, then cut and clarity grouped by carat.
color
cut
cut_carat
clarity_carat
"""# Draft: mention that length, width and depth make up the carat of a diamond, while cut, clarity and color can give you a clue about the price, or how it is more influential on the carat.#"""