-
Notifications
You must be signed in to change notification settings - Fork 0
/
x.3. data_warehousing.txt
255 lines (228 loc) · 9.26 KB
/
x.3. data_warehousing.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
// LESSON 1.1 - WHAT IS DATA WAREHOUSE
data warehouse - has oltp(operational databases) and olap(analytical databases)
business day to day work
customers sohuld be able to find goods and create new orders
inventory staff should be able to stock, retrieve and re-order goods
delivery staff should be able to pickup and deliver good
hr should be able to assesss performance of staff
marketing should be able to attract new customers
management should be able to monitor sales growth
questions for you
can i build a databases to support these activities
// LESSON 1.2 - BUSINESS PROCESS
operational process
- able to find products and make orders (customers)
- add a new products and check stocks (inventory staff)
- pick up and deliver products (delivery staff)
analytical process
- check sales performance (for hr)
- see effect of different sales channels (marketing)
- monitor sales growth (management)
- understand customer behavior (data team)
// LESSON 1.3 - UNDERSTANDING DATA SOURCE - operational and analytical has separated database but they both belong to one data warehouse
operational databases
pros
- excellent for operations
- no redundancy
- fast insert and update
cons
- too slow for analytics
- too many joins
- hard to understand
analytical databases
// LESSON 1.4 - OLTP VS OLAP - operational and analytical has separated database but they both belong to one data warehouse
oltp: operational database
olap: analytical database
// LESSON 2.1 - DATA WAREHOUSE DEFINITION
- system that enables us to support analytical processes
- is a system used for reporting and data analysis and is considered a core component of business intelligence
// LESSON 2.2 - DATA ARCHITECTURE AND ETL MODELS
data warehousing flow
-> data in
-> oltp or operational databases
-> do etl and put them in a olap databases for analytics
-> do analytics at olap databases using business intelligence(bi) apps
// LESSON 2.3 - DIMENSIONAL MODELING (STAR SCHEMA)
dimensional model
- make it easy for business users to work with the data
- improve analytical queries performance
- the technologies used for storing dimensional models are different than traditional technologies
characteristics
- has one fact table and multiple dimension table
// LESSON 2.4 - FACTS VS DIMENSIONAL TABLES
fact table
- collection of dimension table
- its fields are primary key of other tables
dimension table
- what you expect in a table
- holds description of attributes
// LESSON 3.1 - SCHEMA DESIGN AND USING SQL FOR DATA ANALYSIS
- bascially just query here and see what you can extract, that's it baby
// LESSON 4.1 - BUILDING STAR SCHEMA
CREATE TABLE dimDate
(
date_key integer NOT NULL PRIMARY KEY,
date date NOT NULL,
year smallint NOT NULL,
quarter smallint NOT NULL,
month smallint NOT NULL,
day smallint NOT NULL,
week smallint NOT NULL,
is_weekend boolean
);
INSERT INTO dimDate (date_key, date, year, quarter, month, day, week, is_weekend)
SELECT DISTINCT(TO_CHAR(payment_date :: DATE, 'yyyyMMDD')::integer) AS date_key,
date(payment_date) AS date,
EXTRACT(year FROM payment_date) AS year,
EXTRACT(quarter FROM payment_date) AS quarter,
EXTRACT(month FROM payment_date) AS month,
EXTRACT(day FROM payment_date) AS day,
EXTRACT(week FROM payment_date) AS week,
CASE WHEN EXTRACT(ISODOW FROM payment_date) IN (6, 7) THEN true ELSE false END AS is_weekend
FROM payment;
DROP TABLE IF EXISTS dimCustomer;
CREATE TABLE dimCustomer
(
customer_key SERIAL PRIMARY KEY,
customer_id smallint NOT NULL,
first_name varchar(45) NOT NULL,
last_name varchar(45) NOT NULL,
email varchar(50),
address varchar(50) NOT NULL,
address2 varchar(50),
district varchar(20) NOT NULL,
city varchar(50) NOT NULL,
country varchar(50) NOT NULL,
postal_code varchar(10),
phone varchar(20) NOT NULL,
active smallint NOT NULL,
create_date timestamp NOT NULL,
start_date date NOT NULL,
end_date date NOT NULL
);
INSERT INTO dimCustomer (customer_key, customer_id, first_name, last_name, email, address,
address2, district, city, country, postal_code, phone, active,
create_date, start_date, end_date)
SELECT c.customer_id as customer_key,
c.customer_id,
c.first_name,
c.last_name,
c.email,
a.address,
a.address2,
a.district,
ci.city,
co.country,
postal_code,
a.phone,
c.active,
c.create_date,
now() AS start_date,
now() AS end_date
FROM customer c
JOIN address a ON (c.address_id = a.address_id)
JOIN city ci ON (a.city_id = ci.city_id)
JOIN country co ON (ci.country_id = co.country_id);
CREATE TABLE dimMovie
(
movie_key SERIAL PRIMARY KEY,
film_id smallint NOT NULL,
title varchar(255) NOT NULL,
description text,
release_year year,
language varchar(20) NOT NULL,
original_language varchar(20),
rental_duration smallint NOT NULL,
length smallint NOT NULL,
rating varchar(5) NOT NULL,
special_features varchar(60) NOT NULL
);
INSERT INTO dimMovie (movie_key, film_id, title, description, release_year, language, original_language, rental_duration, length, rating, special_features)
SELECT
f.film_id as movie_key,
f.film_id,
f.title,
f.description,
f.release_year,
l.name as language,
orig_lang.name AS original_language,
f.rental_duration,
f.length,
f.rating,
f.special_features
FROM film f
JOIN language l ON (f.language_id=l.language_id)
LEFT JOIN language orig_lang ON (f.language_id = orig_lang.language_id);
CREATE TABLE dimStore
(
store_key SERIAL PRIMARY KEY,
store_id smallint NOT NULL,
address varchar(50) NOT NULL,
address2 varchar(50),
district varchar(20) NOT NULL,
city varchar(50) NOT NULL,
country varchar(50) NOT NULL,
postal_code varchar(10),
manager_first_name varchar(45) NOT NULL,
manager_last_name varchar(45) NOT NULL,
start_date date NOT NULL,
end_date date NOT NULL
);
INSERT INTO dimStore (store_key, store_id, address, address2, district, city, country, postal_code, manager_first_name, manager_last_name, start_date, end_date)
SELECT
s.store_id as store_key,
s.store_id,
a.address,
a.address2,
a.district,
c.city,
co.country,
a.postal_code,
st.first_name as manager_first_name,
st.last_name as manager_last_name,
now() as start_date,
now() as end_date
FROM store s
JOIN staff st ON (s.manager_staff_id = st.staff_id)
JOIN address a ON (s.address_id = a.address_id)
JOIN city c ON (a.city_id = c.city_id)
JOIN country co ON (c.country_id = co.country_id);
DROP TABLE IF EXISTS factSales;
CREATE TABLE factSales
(
sales_key SERIAL PRIMARY KEY,
date_key integer REFERENCES dimDate (date_key),
customer_key integer REFERENCES dimCustomer (customer_key),
movie_key integer REFERENCES dimMovie (movie_key),
store_key integer REFERENCES dimStore (store_key),
sales_amount numeric
);
INSERT INTO factSales (date_key, customer_key, movie_key, store_key, sales_amount)
SELECT
TO_CHAR(payment_date :: DATE, 'yyyyMMDD')::integer AS date_key,
p.customer_id as customer_key,
i.film_id as movie_key,
i.store_id as store_key,
p.amount as sales_amount
FROM payment p
JOIN rental r ON (p.rental_id = r.rental_id)
JOIN inventory i ON (r.inventory_id = i.inventory_id);
-- start schema
SELECT dimMovie.title, dimDate.month, dimCustomer.city, sum(sales_amount) as revenue
FROM factSales
JOIN dimMovie on (dimMovie.movie_key = factSales.movie_key)
JOIN dimDate on (dimDate.date_key = factSales.date_key)
JOIN dimCustomer on (dimCustomer.customer_key = factSales.customer_key)
group by (dimMovie.title, dimDate.month, dimCustomer.city)
order by dimMovie.title, dimDate.month, dimCustomer.city, revenue desc;
-- 3nf
SELECT f.title, EXTRACT(month FROM p.payment_date) as month, ci.city, sum(p.amount) as revenue
FROM payment p
JOIN rental r ON ( p.rental_id = r.rental_id )
JOIN inventory i ON ( r.inventory_id = i.inventory_id )
JOIN film f ON ( i.film_id = f.film_id)
JOIN customer c ON ( p.customer_id = c.customer_id )
JOIN address a ON ( c.address_id = a.address_id )
JOIN city ci ON ( a.city_id = ci.city_id )
group by (f.title, month, ci.city)
order by f.title, month, ci.city, revenue desc;