Author: Tim Siwula

Proposal: http://bit.ly/2gcCLQ4

Kaggle: http://bit.ly/2gMVpPG

Github: http://bit.ly/2gZoTwy

Data: http://bit.ly/2fQ0LHW

Project description

We are trying to predict the click probability for a given article displayed to a user online. The target variable is if a user clicked or not. We are working on featuring engineering given a multitude of data sets. The target variable name is click_prob.

output: pdf_document library(knitr) library(markdown)

#transform the .Rmd to a markdown (.md) file. knit('tims_script.Rmd')

########################### SET UP DATABASE CONNECTION ################################

require("RPostgreSQL")    #install.packages("RPostgreSQL")
driver <- dbDriver("PostgreSQL")   # loads the PostgreSQL driver

# creates a connection to the postgres database
# note that "con" will be used later in each connection to the database
connection <- dbConnect(driver, dbname = "clickprediction",
                 host = "localhost", port = 5432,
                 user = "admin", password = "admin")
dbExistsTable(connection, "clicks_train")  # confirm the tables are accessible

## [1] TRUE

####################################################################################

########################### QUERY THE DATABASE ########################################

# 1)
# try to find features related to ad_id.
# here we join click_train and promoted-content with ad_id.

# look at clicks_train first
getClicksTrain="select * from clicks_train limit 10 "
clicks_train = dbGetQuery(connection, getClicksTrain)
clicks_train

##    display_id  ad_id clicked
## 1           1  42337       0
## 2           1 139684       0
## 3           1 144739       1
## 4           1 156824       0
## 5           1 279295       0
## 6           1 296965       0
## 7           2 125211       0
## 8           2 156535       0
## 9           2 169564       0
## 10          2 308455       1

# look at promoted_content next
getPromotedContent="select * from promoted_content limit 10"
promoted_content = dbGetQuery(connection, getPromotedContent)
promoted_content

##    ad_id document_id campaign_id advertiser_id
## 1      1        6614           1             7
## 2      2      471467           2             7
## 3      3        7692           3             7
## 4      4      471471           2             7
## 5      5      471472           2             7
## 6      6       12736           1             7
## 7      7       12808           1             7
## 8      8      471477           2             7
## 9      9       13379           1             7
## 10    10       13885           1             7

# join click_train and promoted-content with ad_id new table
# 500k apears to be stable with rstudio.
join_query = "
select t.display_id, t.ad_id, t.clicked, d.document_id,
d.topic_id, d.confidence_level
from clicks_train t, promoted_content p, documents_topics d
where t.ad_id = p.ad_id and p.document_id = d.document_id
limit 500000;"
merged_table=dbGetQuery(connection, join_query)
head(merged_table, 3)

##   display_id  ad_id clicked document_id topic_id confidence_level
## 1          6 180923       0     1151028       74       0.09882334
## 2          6 180923       0     1151028       16       0.17285361
## 3         48 180923       0     1151028       74       0.09882334

dim(merged_table)

## [1] 500000      6

####################################################################################

########################### Create new table -- write new table ########################################

dbWriteTable(connection, "merged_table", merged_table, row.names=FALSE)

## Warning in postgresqlWriteTable(conn, name, value, ...): table merged_table
## exists in database: aborting assignTable

## [1] FALSE

# look at the new table
getMergedTable="select * from merged_table limit 500000"

new_table = dbGetQuery(connection, getMergedTable)

# list the structure of mydata
str(new_table)

## 'data.frame':	500000 obs. of  6 variables:
##  $ display_id      : int  13339482 13339482 13339490 13339518 13339520 13339520 13339520 13339520 13339520 13339520 ...
##  $ ad_id           : int  180923 180923 526553 444495 389774 389774 389774 389774 389774 389774 ...
##  $ clicked         : int  0 0 1 0 1 1 1 1 1 1 ...
##  $ document_id     : int  1151028 1151028 2436524 737710 1804537 1804537 1804537 1804537 1804537 1804537 ...
##  $ topic_id        : int  74 16 197 291 159 211 16 66 184 97 ...
##  $ confidence_level: num  0.09882 0.17285 0.02792 0.22343 0.00802 ...

####################################################################################

############################################################################## 3) Models and response Models to consider: Start with the classification problem first - easier problem - start with decision trees, random forests and bagging Then switch to regression

require(tree)

## Loading required package: tree

str(new_table)

## 'data.frame':	500000 obs. of  6 variables:
##  $ display_id      : int  13339482 13339482 13339490 13339518 13339520 13339520 13339520 13339520 13339520 13339520 ...
##  $ ad_id           : int  180923 180923 526553 444495 389774 389774 389774 389774 389774 389774 ...
##  $ clicked         : int  0 0 1 0 1 1 1 1 1 1 ...
##  $ document_id     : int  1151028 1151028 2436524 737710 1804537 1804537 1804537 1804537 1804537 1804537 ...
##  $ topic_id        : int  74 16 197 291 159 211 16 66 184 97 ...
##  $ confidence_level: num  0.09882 0.17285 0.02792 0.22343 0.00802 ...

hist(new_table$confidence_level)

hist(new_table$clicked)

hist(new_table$display_id)

hist(new_table$ad_id)

hist(new_table$document_id)

hist(new_table$topic_id)

attach(new_table)
confidence20=ifelse(new_table$confidence_level>=0.2,"Yes","No")
str(confidence20)

##  chr [1:500000] "No" "No" "No" "Yes" "No" "No" "No" ...

new_table=data.frame(new_table, confidence20)
tree.local.train.c20=tree(confidence20~.-new_table$confidence_level,data=new_table)
summary(tree.local.train.c20)

##
## Classification tree:
## tree(formula = confidence20 ~ . - new_table$confidence_level,
##     data = new_table)
## Variables actually used in tree construction:
## [1] "confidence_level"
## Number of terminal nodes:  2
## Residual mean deviance:  0 = 0 / 5e+05
## Misclassification error rate: 0 = 0 / 5e+05

plot(tree.local.train.c20)
text(tree.local.train.c20,pretty=0)

##############################################################################

########################### create a training set ###########################################

# list the structure of mydata
str(new_table)

## 'data.frame':	500000 obs. of  7 variables:
##  $ display_id      : int  13339482 13339482 13339490 13339518 13339520 13339520 13339520 13339520 13339520 13339520 ...
##  $ ad_id           : int  180923 180923 526553 444495 389774 389774 389774 389774 389774 389774 ...
##  $ clicked         : int  0 0 1 0 1 1 1 1 1 1 ...
##  $ document_id     : int  1151028 1151028 2436524 737710 1804537 1804537 1804537 1804537 1804537 1804537 ...
##  $ topic_id        : int  74 16 197 291 159 211 16 66 184 97 ...
##  $ confidence_level: num  0.09882 0.17285 0.02792 0.22343 0.00802 ...
##  $ confidence20    : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 2 ...

partition_size = floor(0.80 * nrow(new_table)) ## 80% of the sample size
str(partition_size)

##  num 4e+05

set.seed(123) ## set the seed to make your partition reproductible
partition_index <- sample(seq_len(nrow(new_table)), size = partition_size)
local_train_set <- new_table[partition_index, ]
local_test_set <- new_table[-partition_index, ]

# list the structure of mydata
str(local_test_set)

## 'data.frame':	100000 obs. of  7 variables:
##  $ display_id      : int  13339520 13339520 13339539 13339605 13339805 13339805 13339945 13340028 13340034 13340092 ...
##  $ ad_id           : int  389774 389774 400648 509131 528174 528174 90197 528174 180923 180923 ...
##  $ clicked         : int  1 1 0 0 0 0 0 1 0 0 ...
##  $ document_id     : int  1804537 1804537 1804537 2436524 2442282 2442282 483905 2442282 1151028 1151028 ...
##  $ topic_id        : int  184 97 66 197 260 124 140 260 16 16 ...
##  $ confidence_level: num  0.11432 0.21158 0.08875 0.02792 0.00818 ...
##  $ confidence20    : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...

# print first 10 rows of mydata
#head(mydata, n=10)

##############################################################################

Name		Name	Last commit message	Last commit date
Latest commit History 27 Commits
About		About
data		data
figure		figure
scripts		scripts
.RData		.RData
.Rhistory		.Rhistory
.gitignore		.gitignore
Icon		Icon
LICENSE		LICENSE
README.md		README.md
data.Rproj		data.Rproj

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

Author: Tim Siwula

Proposal: http://bit.ly/2gcCLQ4

Kaggle: http://bit.ly/2gMVpPG

Github: http://bit.ly/2gZoTwy

Data: http://bit.ly/2fQ0LHW

Project description

About

Releases

Packages

Languages

License

timxor/click_prediction

Folders and files

Latest commit

History

Repository files navigation

Author: Tim Siwula

Proposal: http://bit.ly/2gcCLQ4

Kaggle: http://bit.ly/2gMVpPG

Github: http://bit.ly/2gZoTwy

Data: http://bit.ly/2fQ0LHW

Project description

About

Topics

Resources

License

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages