-
Notifications
You must be signed in to change notification settings - Fork 1
/
demo_grouping.py
58 lines (43 loc) · 1.58 KB
/
demo_grouping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import pandas as pd
import os
import numpy as np
# Build the path to artwork_data.csv inside the TateGallery demo folder.
csv_path = os.path.join(
    r"I:\Web Developments\Python\VSCode\Panadas\Demo - Json\TateGallery",
    "artwork_data.csv",
)

# Only these columns are read from the CSV; 'id' becomes the row index.
cols_to_use = [
    'id',
    'artist',
    'title',
    'medium',
    'year',
    'acquisitionYear',
    'height',
    'width',
    'units',
]
df = pd.read_csv(csv_path, usecols=cols_to_use, index_col='id')

# Disabled exploration snippet (kept as an inert string literal).
'''
print(df.groupby('artist'))
print(df.describe(include='all'))
print(df.describe(include=[np.number]))'''

# Work on an independent copy of rows 49980..50018 (by position) so that
# later assignments do not touch df.
small_df = df.iloc[49980:50019].copy()
# print(small_df)

# Disabled snippet: iterate over the per-artist groups and print each one.
'''
grouped = small_df.groupby('artist')
for name, group_df in grouped:
print(name)
print(group_df)'''

# Group the sample rows by artist.
grouped = small_df.groupby(by='artist')

# Disabled snippet: print the earliest acquisition year for each artist.
'''
for name, groupby_df in grouped:
min_year = groupby_df['acquisitionYear'].min()
print("{}: {}".format(name, min_year))'''
def fiil_values(series: pd.Series) -> pd.Series:
    """Fill missing entries of *series* with its most frequent value.

    Intended to be used with ``groupby(...)[col].transform`` so that each
    group's gaps are filled with that group's own most common value.

    Args:
        series: The values to fill; NaN/None entries count as missing.

    Returns:
        A new Series with missing entries replaced by the most frequent
        non-missing value. If *series* has no non-missing values (it is
        empty or all-NaN) there is nothing to fill with, so the input is
        returned unchanged.
    """
    # value_counts() ignores NaN and sorts by count, highest first.
    values_counted = series.value_counts(ascending=False)
    if values_counted.empty:
        return series

    # With descending counts, the first index label is the most used value.
    most_frequent = values_counted.index[0]

    # fillna returns a new Series; the caller's data is not modified here.
    return series.fillna(most_frequent)
# Fill missing 'medium' values artist by artist: transform() applies
# fiil_values to each artist's 'medium' Series and returns a result aligned
# with small_df's index, so it can be assigned straight back.
grouped_medium = small_df.groupby('artist')['medium']
small_df.loc[:, 'medium'] = grouped_medium.transform(fiil_values)

# Min: two equivalent spellings of the per-artist minimum of every column.
# The string alias 'min' is used instead of np.min because pandas 2.x emits a
# FutureWarning when a NumPy reduction is passed to agg(). The results are
# not stored or printed; the calls are kept as a demonstration only.
df.groupby('artist').agg('min')
df.groupby('artist').min()

# Count how many rows share each title and list the most common titles first.
grouped_title = df.groupby('title')
title_count = grouped_title.size().sort_values(ascending=False)
print(title_count)