forked from r4ds/bookclub-dsieur
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Chapter 7 online science course data.R
417 lines (346 loc) · 13.7 KB
/
Chapter 7 online science course data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
# Useful shortcuts
# Run current line/selection: cmd + return (Mac), ctrl + Enter (Windows)
# Assignment sign (<-): Option + - (Mac), Alt + - (Windows)
# Pipe sign (%>%): cmd + shift + M (Mac), ctrl + shift + M (Windows)
# One-time setup: install the packages this script needs.
# install.packages(c("tidyverse", "apaTables", "sjPlot", "dataedu", "summarytools", "ggpubr"))
# dataedu wasn't available for R ver 3.6.3 so the dev version is installed from
# GitHub. The requireNamespace() guard skips the install when the package is
# already present, so re-running the script does not reinstall it every time.
if (!requireNamespace("dataedu", quietly = TRUE)) {
  remotes::install_github("data-edu/dataedu")
}
# summarytools was not in the book but is useful to get descriptives;
# it is likewise installed only when missing
if (!requireNamespace("summarytools", quietly = TRUE)) {
  install.packages("summarytools")
}
# Load packages
library(tidyverse)
library(apaTables)
library(sjPlot)
library(readxl)
library(dataedu)
library(summarytools)
library(ggpubr)
############################
# import data from dataedu #
############################
# Copy the three example datasets that ship with the dataedu package into the
# workspace so they can be modified below.
# Pre-survey for the F15 and S16 semesters
pre_survey <- dataedu::pre_survey
# Gradebook and log-trace data for F15 and S16 semesters
course_data <- dataedu::course_data
# Log-trace data for F15 and S16 semesters - this is for time spent
course_minutes <- dataedu::course_minutes
#############
# view data #
#############
# Three ways to inspect a data frame (these are meant for interactive use)
head(pre_survey) # first six rows
View(pre_survey) # a full view in a separate tab
glimpse(pre_survey) # list of variables, values for the first couple of cases
# get a quick look at each variable in df
# NOTE(review): lowercase view() is presumably summarytools' viewer here, since
# summarytools is attached after tidyverse -- confirm if the library order changes
view(dfSummary(pre_survey))
## take a look at course_data, course_minutes; what do you notice?
########################
# 1.process pre_survey #
########################
pre_survey <-
  pre_survey %>%
  # Rename the questions to something easier to work with: R is case
  # sensitive, so working with mixed-case variable names is prone to error
  rename(
    q1 = Q1MaincellgroupRow1,
    q2 = Q1MaincellgroupRow2,
    q3 = Q1MaincellgroupRow3,
    q4 = Q1MaincellgroupRow4,
    q5 = Q1MaincellgroupRow5,
    q6 = Q1MaincellgroupRow6,
    q7 = Q1MaincellgroupRow7,
    q8 = Q1MaincellgroupRow8,
    q9 = Q1MaincellgroupRow9,
    q10 = Q1MaincellgroupRow10
  ) %>%
  # Convert all question responses to numeric, overwriting the columns in
  # place. across() is the current replacement for the superseded mutate_at().
  # q1-q10 already seem to be numeric, so this acts only as a safeguard.
  mutate(across(q1:q10, as.numeric))
# you could add a suffix to the var names to indicate the three dimensions
# e.g. q1.i, q2.u, q3.p where i = interest, u = utility, p = perceived competence
###########################################
#1a.practice mutate = making a new variable #
###########################################
# Build a small practice df (data frame) with two columns/vars: male & female
df <- tibble(male = 5, female = 5)
# mutate() adds a new column: "total_students" is the sum of the "male" and
# "female" variables. This call only prints the result; df itself is unchanged
mutate(df, total_students = male + female)
# assign the result back to df to keep the new column
df <- mutate(df, total_students = male + female)
############################################################
# 1b. reverse_scale function (applied below with mutate()) #
############################################################
# This part of the code is where we write the function:
# Function for reversing scales
reverse_scale <- function(question, scale_min = 1, scale_max = 5) {
  # Reverses the response scale of a survey question for consistency,
  # e.g. on the default 1-5 scale: 1 -> 5, 2 -> 4, 3 -> 3, 4 -> 2, 5 -> 1
  # Arguments:
  #   question  - vector of survey responses
  #   scale_min - lowest point of the response scale (default 1)
  #   scale_max - highest point of the response scale (default 5)
  # Returns:
  #   a numeric vector of reversed responses, the same length as `question`;
  #   NA for missing responses and for any value that is not a point on the
  #   scale (e.g. 0, 6 or 2.5 on the default scale)
  stopifnot(
    is.numeric(scale_min), length(scale_min) == 1,
    is.numeric(scale_max), length(scale_max) == 1,
    scale_min <= scale_max
  )
  scale_points <- seq(scale_min, scale_max)
  # match() returns each response's position on the scale (NA when it is not
  # a valid scale point); indexing the reversed scale with that position
  # flips the response. as.numeric() keeps the result a double for any input.
  as.numeric(rev(scale_points)[match(question, scale_points)])
}
# Try the function out: print the reversed q4 scores, then the original q4
# responses, to compare the two
reverse_scale(pre_survey[["q4"]])
pre_survey[["q4"]]
# Now reverse the scales in the data itself
# Questions 4 and 7 are the ones that get reversed
pre_survey <- mutate(
  pre_survey,
  q4 = reverse_scale(q4), # re-using the original var name overwrites the column
  q7 = reverse_scale(q7)
)
# Note: psych package has reverse.code() function so you don't have to write your own function
#####################################################
#1c. pivot_longer to make pre_survey into long form #
#####################################################
# Reshape the survey from wide to long format (one row per response) and
# label every question with the motivation dimension it measures.
# The long format df is named measure_mean
measure_mean <-
  pre_survey %>%
  pivot_longer(
    cols = q1:q10,
    names_to = "question", # new var/col that holds the question name
    values_to = "response" # new var/col that holds the response value
  ) %>%
  # "measure" denotes the 3 dimensions of motivation
  mutate(
    measure = case_when(
      question %in% c("q1", "q4", "q5", "q8", "q10") ~ "int",
      question %in% c("q2", "q6", "q9") ~ "uv",
      question %in% c("q3", "q7") ~ "pc",
      TRUE ~ NA_character_
    )
  )
###################################################
# 1d. Get mean scores for each motivation measure #
# Across ~912 students who responded to the pre-survey
# using group_by() and summarize()
###################################################
measure_mean <-
  measure_mean %>%
  # one group per motivation measure
  group_by(measure) %>%
  summarize(
    # mean response for each measure, ignoring missing responses
    mean_response = mean(response, na.rm = TRUE),
    # share of responses in each measure that are NA; despite the name this
    # is a proportion between 0 and 1, not a percentage
    percent_NA = mean(is.na(response))
  )
measure_mean
##############################
# 2. Process the course data #
##############################
View(course_data)
# CourseSectionOrigID is split on "-" into three new columns so that course
# subject, semester, and section each get their own column
course_data <- separate(
  course_data,
  col = CourseSectionOrigID,
  into = c("subject", "semester", "section"),
  sep = "-",
  remove = FALSE # keep the original var alongside the new ones
)
#############################################
# 3. Join/merge course_data with pre_survey #
#############################################
# Give the pre_survey id vars the names that are used as join keys below
pre_survey <- rename(
  pre_survey,
  student_id = opdata_username, # new_var_name = old_var_name
  course_id = opdata_CourseID
)
pre_survey
################################################
#3a. extract 5 digits in between _ _ in student_id #
################################################
# Try str_sub() on a single string value first
str_sub("_99888_1", start = 2) # everything from the 2nd character on
str_sub("_99888_1", start = -3) # the last three characters
str_sub("_99888_1", start = 2, end = -3) # 2nd through 3rd-from-last character
# Re-create "student_id" without the extraneous characters and, in the same
# step, store it as numeric so that R no longer thinks it is text
pre_survey <-
  pre_survey %>%
  mutate(student_id = as.numeric(str_sub(student_id, start = 2, end = -3)))
###########################################################
#3b rename id vars in course_data and join with pre_survey
##########################################################
# Rename the id vars in course_data so they match the ones in pre_survey
course_data <- rename(
  course_data,
  student_id = Bb_UserPK,
  course_id = CourseSectionOrigID
)
# dat = course_data with the matching pre_survey columns added;
# a left join keeps every row of course_data
dat <-
  course_data %>%
  left_join(pre_survey, by = c("student_id", "course_id"))
dat
############################################
#4. Process course_minutes & join with dat
############################################
# Prepare course_minutes: use the same id var names as dat and change the
# data type of student_id so it can be matched to student_id in dat
course_minutes <-
  course_minutes %>%
  rename(
    student_id = Bb_UserPK,
    course_id = CourseSectionOrigID
  ) %>%
  mutate(student_id = as.integer(student_id))
dat <-
  dat %>%
  left_join(course_minutes, by = c("student_id", "course_id")) %>%
  # dat has many gradebook_items per student per course, but we want just one
  # row per student & course combo; distinct() keeps the first row of each
  # combo, and .keep_all = TRUE retains all the other columns
  distinct(course_id, student_id, .keep_all = TRUE) %>%
  # rename final grade var
  rename(final_grade = FinalGradeCEMS)
###########################################
# 5. Analysis
###########################################
#######################################################################
# 5a.Scatter plot to examine relationship between final grade & time spent
#######################################################################
# quick look at every variable in the joined data
view(dfSummary(dat))
# scatter plot to see the relationship between time spent & final grade
p1 <- dat %>%
  # aes() tells ggplot2 what variables to map to what feature of a plot
  # Here we map variables to the x- and y-axis
  ggplot(aes(x = TimeSpent, y = final_grade)) +
  # Creates a point with x- and y-axis coordinates specified above
  geom_point(color = dataedu_colors("green")) +
  theme_dataedu() +
  labs(x = "Time Spent",
       y = "Final Grade")
# add a line of best fit
p1 + geom_smooth(method = "lm")
# with ggpubr, you can add the correlation to the graph
# (ggpubr is already attached by library(ggpubr) at the top of the script, so
# no require() call is needed here; require() would only warn, not stop, if
# the package were missing)
p2 <- ggscatter(dat, x = "TimeSpent", y = "final_grade",
                color = "springgreen4",
                add = "reg.line", # Add regression line
                add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
                conf.int = TRUE # Add confidence interval
)
# Add correlation coefficient
p2 +
  stat_cor(method = "pearson", label.x = 3900, label.y = 130,
           p.accuracy = 0.001, r.accuracy = 0.01)
###################################################
# 5b.Linear regression with time spent as predictor
###################################################
# simple linear regression: final grade predicted by time spent
m_linear <-
  lm(final_grade ~ TimeSpent, data = dat)
summary(m_linear)
# get a publication-ready table with the tab_model function
# (sjPlot is already attached by library(sjPlot) at the top of the script, so
# no require() call is needed here)
tab_model(m_linear,
          title = "Table 7.1")
# you can copy and paste it into Word!
# or save it with the apa.reg.table function
apa.reg.table(m_linear, filename = "regression-table-output.doc")
###################################################
# 5c.Correlations among the 3 motivation variables
###################################################
# pivot survey_responses to long form
survey_responses <-
  pre_survey %>%
  # Gather questions and responses
  pivot_longer(cols = q1:q10,
               names_to = "question",
               values_to = "response") %>%
  mutate(
    # Here's where we make the column of question categories
    measure = case_when(
      question %in% c("q1", "q4", "q5", "q8", "q10") ~ "int",
      question %in% c("q2", "q6", "q9") ~ "uv",
      question %in% c("q3", "q7") ~ "pc",
      TRUE ~ NA_character_
    ))
# create mean_response for each student for each measure
survey_responses <-
  survey_responses %>%
  group_by(student_id, measure) %>%
  # Here's where we compute the mean of the responses for each stdt & measure combo
  summarize(
    # Mean response for each measure
    mean_response = mean(response, na.rm = TRUE)
  ) %>%
  # summarize() only drops the last grouping level, so the result would still
  # be grouped by student_id; ungroup so later steps work on a plain df
  ungroup()
# Filter NA (missing) responses and pivot to wide form
# (one row per student, one column per measure)
survey_responses <-
  survey_responses %>%
  filter(!is.na(mean_response)) %>%
  pivot_wider(names_from = measure,
              values_from = mean_response)
survey_responses
# get correlation table
# student_id is an identifier, not a measure, so it is dropped first; otherwise
# it would show up in the correlation table and have to be deleted in Word
survey_responses %>%
  select(-student_id) %>%
  apa.cor.table(filename = "corr-table-output.doc")
#############################################################################
# 5d. Linear regression with hours spent (rather than minutes) as predictor
############################################################################
# TimeSpent_hours: the amount of time spent, converted to hours
dat <- mutate(dat, TimeSpent_hours = TimeSpent / 60)
# same linear model as before, except the predictor is time spent in hours
m_linear_1 <- lm(final_grade ~ TimeSpent_hours, data = dat)
# view the output of the linear model
tab_model(m_linear_1, title = "Table 7.2")
##################################################################
# 5e. Linear regression with standardized time spent as predictor
##################################################################
# standardize the TimeSpent variable to have a mean of 0 and a standard
# deviation of 1; this makes the intercept more interpretable
dat <-
  dat %>%
  # scale() returns a one-column matrix (with centering/scaling attributes);
  # as.numeric() turns it into a plain numeric vector so the new column
  # behaves like every other column in dat
  mutate(TimeSpent_std = as.numeric(scale(TimeSpent)))
# the same linear model as above, but with the TimeSpent variable standardized
m_linear_2 <-
  lm(final_grade ~ TimeSpent_std, data = dat)
# viewing the output of the linear model
tab_model(m_linear_2,
          title = "Table 7.3")
#####################################################################
#6. Multiple regression model with time spent & subject as predictors
#####################################################################
# linear model with subject added as a second predictor
# independent variables (here TimeSpent_std and subject) are simply joined
# with a plus symbol in the formula
m_linear_3 <- lm(final_grade ~ TimeSpent_std + subject, data = dat)
# note: subject is a categorical variable and it seems AnPhA (animal physiology)
# is set as the reference category (numbers assigned by alphabetical order)
tab_model(m_linear_3, title = "Table 7.4")
# Put all four models in one table & show standard errors rather than CIs
tab_model(
  m_linear, m_linear_1, m_linear_2, m_linear_3,
  show.ci = FALSE,
  show.se = TRUE
)
#####################################################################
#7. What other analyses can you think of?
#####################################################################
# Add total scores of pre-course motivation as a predictor?
# --> use mutate to create sum_motiv variable
# Does the effect of time spent vary by subjects/courses?
# --> add time x subject interaction term