This document illustrates some Exploratory Data Analysis (EDA) (Tukey 1977) of Canvas exam grades. The data are wrangled with tidyverse and dplyr (Wickham 2019) functions, then plotted with ggplot2 (Wickham et al. 2020) and ggstatsplot (Patil 2021).
We begin by loading the necessary libraries.
# Read CSV file
library(readr)
# Data wrangling and formatting
library(tidyverse)
library(kableExtra)
library(knitr)
# Set the document-wide knitr chunk defaults. Every option is passed by
# name, so the order in which they are listed does not matter.
opts_chunk$set(
  echo = TRUE,
  results = "asis",
  comment = NA,
  prompt = FALSE,
  cache = FALSE
)
# Stats
library(summarytools)
# Plotting
library(ggplot2)
library(ggstatsplot)
Since we are working with multiple classes, in this case eight comparable instances, the files were pulled from Canvas and then saved to a CSV file. Here that file is read from disk and wrangled into shape.
# Read the saved gradebook CSV from disk into `gb_list`.
gb_list <- readr::read_csv(file = "data/gb_list.csv")
Warning: Missing column names filled in: 'X1' [1]
-- Column specification --------------------------------------------------------
cols(
X1 = col_double(),
anon_id = col_character(),
assignment_name = col_character(),
score = col_double(),
course_id = col_double(),
course_name = col_character()
)
# Convert both course identifier columns to factors in a single mutate().
gb_list <- mutate(
  gb_list,
  course_id = as_factor(course_id),
  course_name = as_factor(course_name)
)
The base R factor() function is applied to order the classes chronologically. This ensures proper ordering in the graphs that follow.
# Re-level course_name with base factor() so the classes follow the explicit
# order listed below instead of their order of appearance in the data.
# Any course_name not listed here would become NA.
gb_list <- gb_list %>%
  mutate(
    course_name = factor(
      course_name,
      levels = c(
        "ANTH-125G S19",
        "ANTH-125G F19",
        "HON-235G F19",
        "ANTH-125G S20",
        "ANTH-125G F20",
        "HON-235G F20",
        "ANTH-1140G S21",
        "HNRS-2161G S21"
      )
    )
  )
# Exam 1 descriptive statistics, one row per course, rendered as a striped
# HTML table. kableExtra is already attached with the other libraries at the
# top of the document, so it is not loaded a second time here.
gb_list %>%
  # keep only assignments whose name starts with "Exam1" / "EXAM1"
  filter(str_detect(assignment_name, "^Exam1|^EXAM1")) %>%
  group_by(course_name) %>%
  select(assignment_name, score, course_name) %>%
  descr(
    stats = c("mean", "sd", "min", "med", "max", "skewness", "kurtosis"),
    transpose = TRUE,
    headings = FALSE
  ) %>%
  # tb() turns the summarytools result into a tibble that kable() can format
  tb() %>%
  kable(format = "html", digits = 2) %>%
  kable_styling(bootstrap_options = "striped")
course_name | variable | mean | sd | min | med | max | skewness | kurtosis |
---|---|---|---|---|---|---|---|---|
ANTH-125G S19 | score | 75.14 | 16.70 | 0 | 78.0 | 96 | -1.98 | 5.73 |
ANTH-125G F19 | score | 105.63 | 19.64 | 60 | 108.0 | 144 | -0.38 | -0.45 |
HON-235G F19 | score | 124.21 | 17.06 | 87 | 127.5 | 144 | -0.53 | -0.89 |
ANTH-125G S20 | score | 110.02 | 16.40 | 69 | 111.0 | 144 | -0.36 | 0.02 |
ANTH-125G F20 | score | 114.84 | 19.28 | 66 | 114.0 | 150 | -0.30 | -0.43 |
HON-235G F20 | score | 134.29 | 11.60 | 99 | 135.0 | 147 | -1.51 | 2.37 |
ANTH-1140G S21 | score | 117.85 | 19.12 | 66 | 123.0 | 147 | -0.73 | 0.15 |
HNRS-2161G S21 | score | 126.82 | 12.45 | 111 | 126.0 | 150 | 0.40 | -1.27 |
# Histograms of Exam 1 scores, drawn in one facet panel per course.
gb_list %>%
  group_by(course_name) %>%
  filter(str_detect(assignment_name, "^Exam1|^EXAM1")) %>%
  ggplot(mapping = aes(x = score)) +
  geom_histogram() +
  facet_wrap(vars(course_name)) +
  ggtitle("Exam 1 Histograms Across Classes")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Side-by-side boxplots of Exam 1 scores, one box per course.
gb_list %>%
  filter(str_detect(assignment_name, "^Exam1|^EXAM1")) %>%
  group_by(course_id) %>%
  ggplot(aes(course_name, score, group = course_name)) +
  geom_boxplot() +
  labs(
    title = "Exam 1 Scores Across Different Classes",
    x = "Course",
    y = "Exam Score"
  ) +
  # rotate the course labels so they do not overlap
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
# Between-course comparison plot of Exam 1 scores via ggstatsplot;
# pairwise comparisons are switched off.
gb_list %>%
  filter(str_detect(assignment_name, "^Exam1|^EXAM1")) %>%
  group_by(course_name) %>%
  ggbetweenstats(
    x = course_name,
    y = score,
    pairwise.comparisons = FALSE
  ) +
  labs(
    title = "Exam 1 Scores Across Different Classes",
    x = "Course",
    y = "Exam Score"
  ) +
  # rotate the course labels so they do not overlap
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
# Exam 2 descriptive statistics, one row per course, rendered as a striped
# HTML table.
gb_list %>%
  filter(str_detect(assignment_name, "^Exam2|^EXAM2")) %>%
  group_by(course_name) %>%
  select(assignment_name, score, course_name) %>%
  descr(
    headings = FALSE,
    transpose = TRUE,
    stats = c(
      "mean", "sd",
      "min", "med", "max",
      "skewness", "kurtosis"
    )
  ) %>%
  tb() %>%
  kable(digits = 2, format = "html") %>%
  kable_styling(bootstrap_options = "striped")
course_name | variable | mean | sd | min | med | max | skewness | kurtosis |
---|---|---|---|---|---|---|---|---|
ANTH-125G S19 | score | 69.57 | 20.36 | 0 | 75 | 94 | -2.09 | 4.59 |
ANTH-125G F19 | score | 101.49 | 24.48 | 0 | 105 | 141 | -1.84 | 6.01 |
HON-235G F19 | score | 126.64 | 12.74 | 99 | 126 | 141 | -0.49 | -0.81 |
ANTH-125G S20 | score | 114.21 | 30.53 | 0 | 120 | 147 | -2.54 | 6.75 |
ANTH-125G F20 | score | 115.08 | 18.75 | 51 | 117 | 141 | -0.84 | 0.58 |
HON-235G F20 | score | 123.53 | 33.31 | 0 | 126 | 147 | -2.93 | 8.17 |
ANTH-1140G S21 | score | 113.54 | 13.95 | 78 | 117 | 138 | -0.47 | -0.39 |
HNRS-2161G S21 | score | 119.18 | 9.59 | 108 | 120 | 141 | 0.78 | -0.13 |
# Histograms of Exam 2 scores, drawn in one facet panel per course.
gb_list %>%
  group_by(course_name) %>%
  filter(str_detect(assignment_name, "^Exam2|^EXAM2")) %>%
  ggplot(mapping = aes(x = score)) +
  geom_histogram() +
  facet_wrap(vars(course_name)) +
  ggtitle("Exam 2 Histograms Across Classes")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Side-by-side boxplots of Exam 2 scores, one box per course.
gb_list %>%
  filter(str_detect(assignment_name, "^Exam2|^EXAM2")) %>%
  group_by(course_id) %>%
  ggplot(aes(course_name, score, group = course_name)) +
  geom_boxplot() +
  labs(
    title = "Exam 2 Scores Across Different Classes",
    x = "Course",
    y = "Exam Score"
  ) +
  # rotate the course labels so they do not overlap
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
# Between-course comparison plot of Exam 2 scores via ggstatsplot;
# pairwise comparisons are switched off.
#
# The x variable is course_name (not course_id) so the axis matches its
# "Course" label, uses the chronological factor order, and agrees with the
# Exam 1 plot and the boxplots. The data are passed ungrouped: grouping by
# course_name while plotting another column is what produced the
# "Adding missing grouping variables" message.
gb_list %>%
  filter(str_detect(assignment_name, "^Exam2|^EXAM2")) %>%
  ggbetweenstats(
    data = .,
    x = course_name,
    y = score,
    pairwise.comparisons = FALSE
  ) +
  # rotate the course labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Exam 2 Scores Across Different Classes",
    x = "Course",
    y = "Exam Score"
  )
Adding missing grouping variables: `course_name`
# Exam 3 descriptive statistics, one row per course, rendered as a striped
# HTML table.
#
# The filter also matches assignment names starting with "Fin" / "FIN",
# exactly as the Exam 3 histogram, boxplot and ggbetweenstats chunks do, so
# the table summarises the same set of rows that the plots display.
gb_list %>%
  filter(str_detect(assignment_name, "^Exam3|^EXAM3|^Fin|^FIN")) %>%
  group_by(course_name) %>%
  select(assignment_name, score, course_name) %>%
  descr(
    stats = c("mean", "sd", "min", "med", "max", "skewness", "kurtosis"),
    transpose = TRUE,
    headings = FALSE
  ) %>%
  # tb() turns the summarytools result into a tibble that kable() can format
  tb() %>%
  kable(format = "html", digits = 2) %>%
  kable_styling(bootstrap_options = "striped")
course_name | variable | mean | sd | min | med | max | skewness | kurtosis |
---|---|---|---|---|---|---|---|---|
ANTH-125G S20 | score | 108.84 | 29.49 | 0 | 114 | 144 | -2.37 | 6.13 |
ANTH-125G F20 | score | 144.72 | 41.58 | 0 | 156 | 195 | -2.13 | 5.11 |
HON-235G F20 | score | 158.47 | 44.40 | 0 | 171 | 192 | -2.60 | 6.62 |
ANTH-1140G S21 | score | 133.54 | 56.24 | 0 | 156 | 183 | -1.64 | 1.23 |
HNRS-2161G S21 | score | 166.91 | 15.72 | 141 | 171 | 192 | -0.34 | -1.14 |
# Histograms of Exam 3 scores (including assignments whose name starts with
# "Fin" / "FIN"), drawn in one facet panel per course.
gb_list %>%
  group_by(course_name) %>%
  filter(str_detect(assignment_name, "^Exam3|^EXAM3|^Fin|^FIN")) %>%
  ggplot(mapping = aes(x = score)) +
  geom_histogram() +
  facet_wrap(vars(course_name)) +
  ggtitle("Exam 3 Histograms Across Classes")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Side-by-side boxplots of Exam 3 scores (including assignments whose name
# starts with "Fin" / "FIN"), one box per course.
gb_list %>%
  filter(str_detect(assignment_name, "^Exam3|^EXAM3|^Fin|^FIN")) %>%
  group_by(course_id) %>%
  ggplot(aes(course_name, score, group = course_name)) +
  geom_boxplot() +
  labs(
    title = "Exam 3 Scores Across Different Classes",
    x = "Course",
    y = "Exam Score"
  ) +
  # rotate the course labels so they do not overlap
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
# Between-course comparison plot of Exam 3 scores (including assignments
# whose name starts with "Fin" / "FIN") via ggstatsplot; pairwise
# comparisons are switched off.
#
# The x variable is course_name (not course_id) so the axis matches its
# "Course" label, uses the chronological factor order, and agrees with the
# Exam 1 plot and the boxplots. The data are passed ungrouped: grouping by
# course_name while plotting another column is what produced the
# "Adding missing grouping variables" message.
gb_list %>%
  filter(str_detect(assignment_name, "^Exam3|^EXAM3|^Fin|^FIN")) %>%
  ggbetweenstats(
    data = .,
    x = course_name,
    y = score,
    pairwise.comparisons = FALSE
  ) +
  # rotate the course labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Exam 3 Scores Across Different Classes",
    x = "Course",
    y = "Exam Score"
  )
Adding missing grouping variables: `course_name`