30 Comparative Judgement Practical

Create a new Comparative Judgement task on www.nomoremarking.com and upload 8 pdfs with the numbers 1 to 8. Judge the numbers 80 times and download the decisions file.

30.1 Fitting the Bradley-Terry Model

rm(list=ls())
library(tidyverse)
library(ggplot2)
library(sirt)
decisions <- read_csv('data/decisions-1.csv')
persons <- read_csv('data/persons-1.csv')
glimpse(decisions)
glimpse(persons)

# get the person ids and their first names
persons <- persons %>% 
  select(Code, `First Name`) %>% 
  rename(id = Code, name = `First Name`)

# go through the decisions and add a column with the name of the person who was chosen or not chosen in each case
decisions <- decisions %>% left_join(persons, by = c('chosen' = 'id')) %>% 
  rename(chosenName = name) %>% 
  left_join(persons, by = c('notChosen' = 'id')) %>% 
  rename(notChosenName = name)

# mutate chosenName and notChosenName to be factors
decisions <- decisions %>% 
  mutate(chosenName = factor(chosenName), notChosenName = factor(notChosenName))

# create a name for each judge
decisions <- decisions %>% mutate(
    judge = paste0('judge_', judgeName)
)

# get a vector of judges
judges <- decisions %>% pull(judge)

# format for the btm model
decisions <- decisions %>% 
  select(chosenName, notChosenName) %>% 
  mutate(result = 1)

# the package doesn't like tibbles?
decisions <- as.data.frame(decisions)
mod1 <- btm(decisions, judge=judges, fix.eta = 0, ignore.ties = TRUE, eps = 0.3)

30.2 Explore the model results

See https://alexanderrobitzsch.github.io/sirt/reference/btm.html

30.2.1 Reliability

# The mle reliability, also known as separation reliability
print(mod1$mle.rel)

[1] 0.8900993

30.2.2 Person parameters

# The person parameters
knitr::kable(mod1$effects, digits = 2)

	individual	id	Ntot	N1	N0	raw	score	propscore	theta	se.theta	outfit	infit
8	8	8	24	24	0	24	23.70	0.99	6.42	1.86	0.01	0.05
7	7	7	21	17	4	17	16.81	0.80	3.56	1.38	0.04	0.19
6	6	6	20	16	4	16	15.82	0.79	2.65	1.15	0.05	0.19
5	5	5	20	12	8	12	11.94	0.60	0.81	1.04	0.06	0.13
4	4	4	20	7	13	7	7.09	0.35	-0.91	1.17	0.05	0.16
3	3	3	19	4	15	4	4.17	0.22	-2.01	1.16	0.05	0.15
2	2	2	21	3	18	3	3.21	0.15	-4.10	1.21	0.04	0.10
1	1	1	21	0	21	0	0.30	0.01	-6.41	1.92	0.02	0.09

30.2.3 Judge fit

# The judge fit
# Fit statistics (outfit and infit) for judges if judge is provided. In addition, average agreement of the rating with the mode of the ratings is calculated for each judge (at least three ratings per dyad has to be available for computing the agreement)
knitr::kable(mod1$fit_judges, digits = 2)

	judge	outfit	infit	agree
judge_1	judge_1	0.06	0.23	1
judge_2	judge_2	0.02	0.09	1
judge_3	judge_3	0.05	0.14	1
judge_4	judge_4	0.04	0.11	1
judge_5	judge_5	0.02	0.13	1
judge_6	judge_6	0.08	0.27	1
judge_7	judge_7	0.03	0.08	1
judge_8	judge_8	0.03	0.08	1
judge_9	judge_9	0.03	0.07	1

30.2.4 Visualise the person parameters (model effects) along with the standard error

# plot the model effects with the standard errors
# add a column to the person parameters with the upper and lower limits using 1.96 * standard error
effects <- mod1$effects %>% 
  mutate(
    lower = theta - 1.96 * se.theta,
    upper = theta + 1.96 * se.theta
  )

p <- ggplot(effects, aes(x = individual, y = theta)) +
  geom_point() +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
  theme_bw() +
  labs(x = 'Person parameter', y = 'Person')
print(p)

30.2.5 Inspect the model probabilities

30.2.6 What is p1? p0? pD?

head(mod1$probs)

            p1           p0           pD
[1,] 0.9963473 0.0036526689 6.100379e-45
[2,] 0.9998848 0.0001151765 1.085183e-45
[3,] 0.7509258 0.2490741530 4.373299e-44
[4,] 0.9906881 0.0093119089 9.712569e-45
[5,] 0.9099456 0.0900544355 2.894720e-44
[6,] 0.9959516 0.0040483851 6.421054e-45

30.3 Extension exercises

Try some less accurate judging. Note the impact on the model parameters.
Can you recreate the model probabilities?