We also include other information useful for psycholinguists - namely, word frequency databases from van Heuven et al. (2014).
library(tidyverse)
Load data.
# freq taken from:
# van Heuven, W. J. B., Mandera, P., Keuleers, E., & Brysbaert, M. (2014). SUBTLEX-UK: A new and improved word frequency database for British English. The Quarterly Journal of Experimental Psychology, 67(6), 1176–1190. https://doi.org/10.1080/17470218.2013.850521
# see here: https://osf.io/zq49t (the original file is in xlsx, we save it as csv)
# FIX: the file is comma-separated, but read_table() parses whitespace-
# delimited input; that produced 148,414 parsing failures (fields that
# contain spaces, e.g. Homophonic_entry, were split on internal spaces
# and shifted neighbouring columns). read_csv() parses the file correctly.
freq <- read_csv("subtlex_uk.csv")
##
## ── Column specification ──────────────────────────────────────────────────
## cols(
## .default = col_double(),
## Spelling = col_character(),
## DomPoS = col_character(),
## DomPoSLemma = col_character(),
## AllPoS = col_character(),
## AllPoSFreq = col_character(),
## Spell_check = col_character(),
## Homophonic_entry = col_character(),
## Double_word_entry = col_character(),
## Freq_double_entry = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 148414 parsing failures.
## row col expected actual file
## 1 -- 27 columns 25 columns 'subtlex_uk.csv'
## 3 -- 27 columns 25 columns 'subtlex_uk.csv'
## 8 -- 27 columns 25 columns 'subtlex_uk.csv'
## 9 -- 27 columns 28 columns 'subtlex_uk.csv'
## 10 -- 27 columns 25 columns 'subtlex_uk.csv'
## ... ... .......... .......... ................
## See problems(...) for more details.
# self-paced reading RTs (Frank et al.); whitespace-delimited text file
spr <- read_table(file.path("frank_etal", "selfpacedreading.RT.txt"))
##
## ── Column specification ──────────────────────────────────────────────────
## cols(
## subj_nr = col_double(),
## sent_nr = col_double(),
## sent_pos = col_double(),
## correct = col_character(),
## answer_time = col_character(),
## word_pos = col_double(),
## word = col_character(),
## RT = col_double()
## )
# eye-tracking RTs (Frank et al.); whitespace-delimited text file
et <- read_table(file.path("frank_etal", "eyetracking.RT.txt"))
##
## ── Column specification ──────────────────────────────────────────────────
## cols(
## subj_nr = col_double(),
## sent_nr = col_double(),
## sent_pos = col_double(),
## correct = col_character(),
## answer_time = col_character(),
## word_pos = col_double(),
## word = col_character(),
## RTfirstfix = col_double(),
## RTfirstpass = col_double(),
## RTrightbound = col_double(),
## RTgopast = col_double()
## )
# quick look at the frequency table
freq %>% print(n = 10)
## # A tibble: 160,022 × 27
## Spelling FreqCount Cbeebies_freq CBBC_freq BNC_freq `LogFreq(Zipf)`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 labour 45591 1 81 27108 5.35
## 2 programme 31950 42 731 19071 5.2
## 3 favourite 27052 2015 3481 4769 5.13
## 4 colour 22651 1665 1789 11541 5.05
## 5 realise 15715 117 910 3904 4.89
## 6 honourable 13956 2 28 887 4.84
## 7 metres 12695 20 1459 3390 4.8
## 8 flavour 10745 160 472 1494 4.73
## 9 colours 9924 968 878 4463 4.69
## 10 recognise 9913 82 446 3648 4.69
## # ℹ 160,012 more rows
## # ℹ 21 more variables: `LogFreqCbeebies(Zipf)` <dbl>,
## # `LogFreqCBBC(Zipf)` <dbl>, `LogFreqBNC(Zipf)` <dbl>, CD_count <dbl>,
## # CD_count_Cbeebies <dbl>, CD_count_CBBC <dbl>, CD <dbl>,
## # CD_cbeebies <dbl>, CD_cbbc <dbl>, DomPoS <chr>, DomPoSLemma <chr>,
## # DomPoSFreq <dbl>, DomPoSLemmaTotalFreq <dbl>, AllPoS <chr>,
## # AllPoSFreq <chr>, CapitFreq <dbl>, Spell_check <chr>, Hyphen <dbl>, …
# quick look at the self-paced reading data
spr %>% print(n = 10)
## # A tibble: 353,584 × 8
## subj_nr sent_nr sent_pos correct answer_time word_pos word RT
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 2 12 c 3630 1 Billy 376
## 2 1 2 12 c 3630 2 wrote 364
## 3 1 2 12 c 3630 3 on 394
## 4 1 2 12 c 3630 4 the 353
## 5 1 2 12 c 3630 5 envelope. 354
## 6 1 3 32 - NaN 1 He 354
## 7 1 3 32 - NaN 2 called 449
## 8 1 3 32 - NaN 3 over 409
## 9 1 3 32 - NaN 4 his 362
## 10 1 3 32 - NaN 5 shoulder. 361
## # ℹ 353,574 more rows
# quick look at the eye-tracking data
et %>% print(n = 10)
## # A tibble: 81,109 × 11
## subj_nr sent_nr sent_pos correct answer_time word_pos word RTfirstfix
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 1 127 - NaN 1 Anne 216
## 2 1 1 127 - NaN 2 lost 152
## 3 1 1 127 - NaN 3 contr… 144
## 4 1 1 127 - NaN 4 and 184
## 5 1 1 127 - NaN 5 laugh… 244
## 6 1 2 150 c 2106 1 Billy 272
## 7 1 2 150 c 2106 2 wrote 160
## 8 1 2 150 c 2106 3 on 280
## 9 1 2 150 c 2106 4 the 0
## 10 1 2 150 c 2106 5 envel… 264
## # ℹ 81,099 more rows
## # ℹ 3 more variables: RTfirstpass <dbl>, RTrightbound <dbl>,
## # RTgopast <dbl>
# Tag each dataset with its experiment type and add the measure columns
# that only exist in the other dataset (filled with NA), so both tibbles
# share one schema before they are stacked.
spr <- spr %>%
  mutate(
    exp = "spr",
    RTfirstfix = NA,
    RTfirstpass = NA,
    RTrightbound = NA,
    RTgopast = NA
  )
et <- et %>%
  mutate(exp = "et", RT = NA)
How many sentences did people read in et (eye tracking) and spr (self-paced reading)?
# number of distinct sentences each subject read (eye tracking)
et %>%
  group_by(subj_nr) %>%
  summarise(n_sent = n_distinct(sent_nr, na.rm = TRUE)) %>%
  print(n = 20)
## # A tibble: 43 × 2
## subj_nr n_sent
## <dbl> <int>
## 1 1 205
## 2 2 205
## 3 3 205
## 4 4 205
## 5 5 205
## 6 6 205
## 7 7 205
## 8 8 204
## 9 9 35
## 10 10 205
## 11 11 205
## 12 12 205
## 13 13 205
## 14 14 205
## 15 15 205
## 16 16 205
## 17 17 175
## 18 18 205
## 19 19 205
## 20 20 205
## # ℹ 23 more rows
# grand mean of sentences read per subject (eye tracking)
et %>%
  group_by(subj_nr) %>%
  summarise(n_sent = n_distinct(sent_nr, na.rm = TRUE)) %>%
  ungroup() %>%
  summarise(mean(n_sent)) %>%
  print(n = 20)
## # A tibble: 1 × 1
## `mean(n_sent)`
## <dbl>
## 1 200.
# NOTE(review): `mean(n_sent)` here is evaluated per subj_nr group, where
# n_sent is a single value, so the resulting column merely duplicates
# n_sent (visible in the output below); the grand mean across subjects is
# computed correctly in the next chunk.
print(spr %>% group_by(subj_nr) %>% summarise(n_sent = length(levels(as.factor(sent_nr))), mean(n_sent)), n=20)
## # A tibble: 117 × 3
## subj_nr n_sent `mean(n_sent)`
## <dbl> <int> <dbl>
## 1 1 290 290
## 2 2 237 237
## 3 3 143 143
## 4 4 177 177
## 5 5 120 120
## 6 6 230 230
## 7 7 152 152
## 8 8 167 167
## 9 9 171 171
## 10 10 214 214
## 11 11 264 264
## 12 12 212 212
## 13 13 314 314
## 14 14 205 205
## 15 15 316 316
## 16 16 256 256
## 17 17 361 361
## 18 18 222 222
## 19 19 132 132
## 20 20 243 243
## # ℹ 97 more rows
# grand mean of sentences read per subject (self-paced reading)
spr %>%
  group_by(subj_nr) %>%
  summarise(n_sent = n_distinct(sent_nr, na.rm = TRUE)) %>%
  ungroup() %>%
  summarise(mean(n_sent)) %>%
  print(n = 20)
## # A tibble: 1 × 1
## `mean(n_sent)`
## <dbl>
## 1 219.
How many participants read each sentence in et and spr?
# number of distinct subjects who read each sentence (eye tracking)
et %>%
  group_by(sent_nr) %>%
  summarise(n_subj = n_distinct(subj_nr, na.rm = TRUE)) %>%
  print(n = Inf)
## # A tibble: 205 × 2
## sent_nr n_subj
## <dbl> <int>
## 1 1 43
## 2 2 42
## 3 3 42
## 4 4 40
## 5 5 42
## 6 6 42
## 7 7 41
## 8 8 41
## 9 9 42
## 10 10 41
## 11 11 42
## 12 12 43
## 13 13 42
## 14 14 41
## 15 15 42
## 16 16 43
## 17 17 42
## 18 18 42
## 19 19 42
## 20 20 41
## 21 21 42
## 22 22 41
## 23 23 41
## 24 24 42
## 25 25 41
## 26 26 42
## 27 27 42
## 28 28 42
## 29 29 42
## 30 30 43
## 31 31 42
## 32 32 41
## 33 33 41
## 34 34 42
## 35 35 42
## 36 36 42
## 37 37 41
## 38 38 42
## 39 39 42
## 40 40 42
## 41 41 42
## 42 42 41
## 43 43 43
## 44 44 42
## 45 45 42
## 46 46 42
## 47 47 42
## 48 48 42
## 49 49 42
## 50 50 42
## 51 51 42
## 52 52 42
## 53 53 42
## 54 54 43
## 55 55 42
## 56 56 42
## 57 57 43
## 58 58 41
## 59 59 42
## 60 60 41
## 61 61 42
## 62 62 43
## 63 63 42
## 64 64 42
## 65 65 43
## 66 66 42
## 67 67 41
## 68 68 42
## 69 69 42
## 70 70 42
## 71 71 41
## 72 72 43
## 73 73 42
## 74 74 42
## 75 75 42
## 76 76 42
## 77 77 42
## 78 78 42
## 79 79 42
## 80 80 42
## 81 81 42
## 82 82 42
## 83 83 42
## 84 84 42
## 85 85 43
## 86 86 42
## 87 87 43
## 88 88 41
## 89 89 41
## 90 90 42
## 91 91 43
## 92 92 42
## 93 93 43
## 94 94 42
## 95 95 43
## 96 96 42
## 97 97 42
## 98 98 42
## 99 99 42
## 100 100 42
## 101 101 42
## 102 102 42
## 103 103 42
## 104 104 42
## 105 105 42
## 106 106 42
## 107 107 42
## 108 108 43
## 109 109 42
## 110 110 42
## 111 111 42
## 112 112 42
## 113 113 42
## 114 114 42
## 115 115 42
## 116 116 42
## 117 117 41
## 118 118 41
## 119 119 42
## 120 120 42
## 121 121 42
## 122 122 42
## 123 123 43
## 124 124 43
## 125 125 42
## 126 126 42
## 127 127 42
## 128 128 42
## 129 129 42
## 130 130 41
## 131 131 42
## 132 132 41
## 133 133 41
## 134 134 41
## 135 135 42
## 136 136 42
## 137 137 42
## 138 138 42
## 139 139 43
## 140 140 42
## 141 141 43
## 142 142 42
## 143 143 43
## 144 144 41
## 145 145 42
## 146 146 42
## 147 147 42
## 148 148 43
## 149 149 41
## 150 150 43
## 151 151 42
## 152 152 42
## 153 154 42
## 154 155 42
## 155 156 42
## 156 157 42
## 157 158 41
## 158 159 43
## 159 160 42
## 160 161 42
## 161 162 42
## 162 163 42
## 163 164 42
## 164 165 41
## 165 166 42
## 166 169 42
## 167 170 42
## 168 172 42
## 169 173 42
## 170 174 43
## 171 175 42
## 172 176 43
## 173 177 42
## 174 178 41
## 175 179 42
## 176 181 42
## 177 182 43
## 178 183 42
## 179 185 42
## 180 186 42
## 181 187 42
## 182 188 42
## 183 189 42
## 184 190 42
## 185 192 43
## 186 193 42
## 187 194 42
## 188 195 42
## 189 197 41
## 190 198 42
## 191 199 43
## 192 201 42
## 193 202 42
## 194 209 42
## 195 210 42
## 196 211 42
## 197 221 42
## 198 226 42
## 199 227 42
## 200 228 42
## 201 230 42
## 202 235 41
## 203 239 43
## 204 241 42
## 205 246 42
# grand mean of subjects per sentence (eye tracking)
et %>%
  group_by(sent_nr) %>%
  summarise(n_subj = n_distinct(subj_nr, na.rm = TRUE)) %>%
  ungroup() %>%
  summarise(mean(n_subj)) %>%
  print(n = Inf)
## # A tibble: 1 × 1
## `mean(n_subj)`
## <dbl>
## 1 42.0
# number of distinct subjects who read each sentence (self-paced reading)
spr %>%
  group_by(sent_nr) %>%
  summarise(n_subj = n_distinct(subj_nr, na.rm = TRUE)) %>%
  print(n = Inf)
## # A tibble: 361 × 2
## sent_nr n_subj
## <dbl> <int>
## 1 1 68
## 2 2 77
## 3 3 67
## 4 4 71
## 5 5 71
## 6 6 69
## 7 7 67
## 8 8 68
## 9 9 78
## 10 10 82
## 11 11 65
## 12 12 67
## 13 13 70
## 14 14 79
## 15 15 67
## 16 16 78
## 17 17 69
## 18 18 69
## 19 19 66
## 20 20 68
## 21 21 74
## 22 22 67
## 23 23 77
## 24 24 69
## 25 25 70
## 26 26 78
## 27 27 65
## 28 28 69
## 29 29 68
## 30 30 76
## 31 31 72
## 32 32 73
## 33 33 74
## 34 34 72
## 35 35 58
## 36 36 72
## 37 37 62
## 38 38 62
## 39 39 76
## 40 40 69
## 41 41 76
## 42 42 70
## 43 43 65
## 44 44 82
## 45 45 69
## 46 46 75
## 47 47 60
## 48 48 67
## 49 49 67
## 50 50 71
## 51 51 76
## 52 52 71
## 53 53 73
## 54 54 74
## 55 55 75
## 56 56 73
## 57 57 70
## 58 58 59
## 59 59 75
## 60 60 83
## 61 61 70
## 62 62 75
## 63 63 72
## 64 64 70
## 65 65 77
## 66 66 70
## 67 67 60
## 68 68 66
## 69 69 71
## 70 70 68
## 71 71 72
## 72 72 66
## 73 73 76
## 74 74 62
## 75 75 74
## 76 76 77
## 77 77 67
## 78 78 68
## 79 79 74
## 80 80 63
## 81 81 78
## 82 82 74
## 83 83 69
## 84 84 61
## 85 85 71
## 86 86 81
## 87 87 73
## 88 88 73
## 89 89 76
## 90 90 74
## 91 91 65
## 92 92 66
## 93 93 60
## 94 94 75
## 95 95 71
## 96 96 78
## 97 97 65
## 98 98 70
## 99 99 72
## 100 100 70
## 101 101 68
## 102 102 68
## 103 103 66
## 104 104 71
## 105 105 77
## 106 106 72
## 107 107 68
## 108 108 66
## 109 109 69
## 110 110 69
## 111 111 74
## 112 112 63
## 113 113 72
## 114 114 67
## 115 115 66
## 116 116 72
## 117 117 66
## 118 118 78
## 119 119 72
## 120 120 71
## 121 121 77
## 122 122 67
## 123 123 69
## 124 124 69
## 125 125 81
## 126 126 78
## 127 127 76
## 128 128 70
## 129 129 67
## 130 130 65
## 131 131 83
## 132 132 62
## 133 133 73
## 134 134 69
## 135 135 68
## 136 136 68
## 137 137 82
## 138 138 68
## 139 139 74
## 140 140 69
## 141 141 62
## 142 142 80
## 143 143 70
## 144 144 73
## 145 145 68
## 146 146 74
## 147 147 65
## 148 148 80
## 149 149 66
## 150 150 77
## 151 151 74
## 152 152 68
## 153 153 63
## 154 154 70
## 155 155 77
## 156 156 67
## 157 157 70
## 158 158 62
## 159 159 75
## 160 160 64
## 161 161 73
## 162 162 58
## 163 163 79
## 164 164 71
## 165 165 71
## 166 166 74
## 167 167 74
## 168 168 68
## 169 169 68
## 170 170 72
## 171 171 73
## 172 172 72
## 173 173 67
## 174 174 66
## 175 175 69
## 176 176 83
## 177 177 67
## 178 178 77
## 179 179 72
## 180 180 66
## 181 181 68
## 182 182 71
## 183 183 63
## 184 184 72
## 185 185 74
## 186 186 70
## 187 187 77
## 188 188 80
## 189 189 71
## 190 190 77
## 191 191 67
## 192 192 70
## 193 193 66
## 194 194 78
## 195 195 79
## 196 196 65
## 197 197 75
## 198 198 68
## 199 199 73
## 200 200 83
## 201 201 67
## 202 202 70
## 203 203 67
## 204 204 74
## 205 205 75
## 206 206 67
## 207 207 71
## 208 208 69
## 209 209 75
## 210 210 68
## 211 211 70
## 212 212 73
## 213 213 74
## 214 214 72
## 215 215 63
## 216 216 68
## 217 217 68
## 218 218 70
## 219 219 79
## 220 220 68
## 221 221 69
## 222 222 68
## 223 223 80
## 224 224 76
## 225 225 70
## 226 226 69
## 227 227 77
## 228 228 66
## 229 229 68
## 230 230 81
## 231 231 72
## 232 232 66
## 233 233 78
## 234 234 73
## 235 235 70
## 236 236 71
## 237 237 68
## 238 238 70
## 239 239 76
## 240 240 69
## 241 241 77
## 242 242 73
## 243 243 69
## 244 244 74
## 245 245 77
## 246 246 70
## 247 247 74
## 248 248 66
## 249 249 78
## 250 250 64
## 251 251 71
## 252 252 72
## 253 253 74
## 254 254 66
## 255 255 66
## 256 256 79
## 257 257 70
## 258 258 70
## 259 259 76
## 260 260 72
## 261 261 73
## 262 262 67
## 263 263 72
## 264 264 70
## 265 265 74
## 266 266 67
## 267 267 73
## 268 268 68
## 269 269 69
## 270 270 63
## 271 271 77
## 272 272 74
## 273 273 75
## 274 274 67
## 275 275 69
## 276 276 63
## 277 277 73
## 278 278 77
## 279 279 65
## 280 280 75
## 281 281 69
## 282 282 75
## 283 283 71
## 284 284 69
## 285 285 65
## 286 286 75
## 287 287 65
## 288 288 73
## 289 289 78
## 290 290 71
## 291 291 67
## 292 292 68
## 293 293 67
## 294 294 71
## 295 295 69
## 296 296 63
## 297 297 72
## 298 298 76
## 299 299 69
## 300 300 72
## 301 301 76
## 302 302 75
## 303 303 76
## 304 304 69
## 305 305 67
## 306 306 76
## 307 307 74
## 308 308 72
## 309 309 72
## 310 310 73
## 311 311 73
## 312 312 68
## 313 313 66
## 314 314 69
## 315 315 82
## 316 316 58
## 317 317 67
## 318 318 67
## 319 319 65
## 320 320 69
## 321 321 75
## 322 322 75
## 323 323 78
## 324 324 68
## 325 325 75
## 326 326 67
## 327 327 80
## 328 328 76
## 329 329 76
## 330 330 77
## 331 331 70
## 332 332 64
## 333 333 59
## 334 334 71
## 335 335 71
## 336 336 73
## 337 337 78
## 338 338 75
## 339 339 75
## 340 340 72
## 341 341 74
## 342 342 69
## 343 343 70
## 344 344 74
## 345 345 75
## 346 346 67
## 347 347 71
## 348 348 78
## 349 349 73
## 350 350 64
## 351 351 84
## 352 352 70
## 353 353 78
## 354 354 76
## 355 355 70
## 356 356 79
## 357 357 70
## 358 358 75
## 359 359 68
## 360 360 72
## 361 361 75
# grand mean of subjects per sentence (self-paced reading)
spr %>%
  group_by(sent_nr) %>%
  summarise(n_subj = n_distinct(subj_nr, na.rm = TRUE)) %>%
  ungroup() %>%
  summarise(mean(n_subj)) %>%
  print(n = Inf)
## # A tibble: 1 × 1
## `mean(n_subj)`
## <dbl>
## 1 71.1
Preparation for merging with frequency measures and with PMB.
# prefix ET subject ids so they cannot collide with SPR subject ids
et <- et %>% mutate(subj_nr = paste0("ET_", subj_nr))
reading <- rbind(spr, et)
# Strip punctuation and lower-case the words so they line up with the
# Spelling column of the frequency database, then attach the frequency
# measures and a scaled log frequency predictor.
reading <- reading %>%
  mutate(
    wordstripped = gsub("[[:punct:]]", "", word),
    Spelling = tolower(wordstripped)
  ) %>%
  left_join(freq, by = "Spelling") %>%
  select(-Spelling) %>%
  mutate(logfreqscaled = as.vector(scale(log(FreqCount))))
Load pmb.
# Parallel Meaning Bank annotations, one row per sense/token
pmb <- read_tsv(file.path("data", "merged", "sbn_combined.tsv"))
## Rows: 4678 Columns: 5
## ── Column specification ──────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Sense, Operator, Word, DocID
## dbl (1): sent_nr
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# quick look at the PMB annotations
pmb %>% print(n = 10, width = Inf)
## # A tibble: 4,678 × 5
## Sense
## <chr>
## 1 Damian is not normally a man of action so now, stirred into action, he…
## 2 male.n.02
## 3 -
## 4 be.v.01
## 5 time.n.08
## 6 normally.r.01
## 7 man.n.01
## 8 action.n.06
## 9 -
## 10 entity.n.01
## Operator Word
## <chr> <chr>
## 1 "-" FULL_SENTENCE
## 2 "Name \"Damian\"" % Damian [0-6]
## 3 "NEGATION <1" %
## 4 "Theme -1 Time +1 Manner +2 Co-Theme +3" % is --not-- [7-13]
## 5 "EQU now" %
## 6 "-" % normally [14-22]
## 7 "Attribute +1" % --a-- man --of-- [23-31]
## 8 "-" % action --so-- [32-41]
## 9 "EXPLANATION <1" %
## 10 "-" %
## DocID sent_nr
## <chr> <dbl>
## 1 d0352_p31.txt 286
## 2 d0352_p31.txt 286
## 3 d0352_p31.txt 286
## 4 d0352_p31.txt 286
## 5 d0352_p31.txt 286
## 6 d0352_p31.txt 286
## 7 d0352_p31.txt 286
## 8 d0352_p31.txt 286
## 9 d0352_p31.txt 286
## 10 d0352_p31.txt 286
## # ℹ 4,668 more rows
# last rows of the PMB annotations
pmb %>% tail(n = 20)
## # A tibble: 20 × 5
## Sense Operator Word DocID sent_nr
## <chr> <chr> <chr> <chr> <dbl>
## 1 I am touched deeply in places. - FULL… d034… 22
## 2 person.n.01 EQU spe… % I … d034… 22
## 3 time.n.08 EQU now % am… d034… 22
## 4 touch.v.01 Experie… % to… d034… 22
## 5 deeply.r.01 - % de… d034… 22
## 6 place.n.03 - % pl… d034… 22
## 7 He held his mother tight and she cried in… - FULL… d035… 133
## 8 male.n.02 - % He… d035… 133
## 9 hold.v.02 Agent -… % he… d035… 133
## 10 time.n.08 TPR now % d035… 133
## 11 male.n.02 ANA -3 % hi… d035… 133
## 12 person.n.01 Role +1 % mo… d035… 133
## 13 mother.n.01 Of -2 % d035… 133
## 14 tight.a.01 - % ti… d035… 133
## 15 - CONTINU… % d035… 133
## 16 female.n.02 ANA -3 % sh… d035… 133
## 17 cry.v.02 Agent -… % cr… d035… 133
## 18 time.n.08 TPR now % d035… 133
## 19 male.n.02 ANA -10 % hi… d035… 133
## 20 chest.n.01 PartOf … % ch… d035… 133
We now merge the two datasets.
# The word info in pmb can consist of up to several whitespace-separated
# tokens (the first token is always "%"); store tokens 2-6 as Word1..Word5.
# Grouping by Word makes strsplit(Word, ...)[[1]] well-defined: Word is
# constant within each group, so [[1]] picks that group's single value.
# FIX: removed a stray empty argument ("..., , Word2 = ...") from the
# original mutate() call.
pmb <- pmb %>%
  group_by(Word) %>%
  mutate(
    Word1 = strsplit(Word, "\\s+")[[1]][2],
    Word2 = strsplit(Word, "\\s+")[[1]][3],
    Word3 = strsplit(Word, "\\s+")[[1]][4],
    Word4 = strsplit(Word, "\\s+")[[1]][5],
    Word5 = strsplit(Word, "\\s+")[[1]][6]
  )
# Store character position info: the token matching the [n-n] pattern goes
# into CharactersOrder, and is then blanked out of the Word1..Word5 slots.
pmb <- pmb %>%
  mutate(
    CharactersOrder = case_when(
      grepl("^\\[\\d+-\\d+\\]$", Word1) ~ Word1,
      grepl("^\\[\\d+-\\d+\\]$", Word2) ~ Word2,
      grepl("^\\[\\d+-\\d+\\]$", Word3) ~ Word3,
      grepl("^\\[\\d+-\\d+\\]$", Word4) ~ Word4,
      grepl("^\\[\\d+-\\d+\\]$", Word5) ~ Word5,
      TRUE ~ NA_character_
    ),
    # mutate() evaluates sequentially, so CharactersOrder above still saw
    # the original slot values before they are cleared here
    across(Word1:Word5, ~ ifelse(grepl("^\\[\\d+-\\d+\\]$", .x), NA, .x))
  )
# Some words are the core of the meaning; these appear without dashes in
# the Word column (function words are wrapped as --word--). The first
# non-dashed token becomes CoreWord.
# FIX: replaced `T ~ NA` with `TRUE ~ NA_character_`: T is a reassignable
# alias, and a logical NA is type-inconsistent with the character branches
# (it errors outright on older dplyr versions).
# NOTE(review): grepl("-", ...) would also skip genuinely hyphenated words
# (e.g. "well-known"); if such tokens occur, matching "^--.*--$" instead
# would be safer — confirm against the PMB token inventory.
pmb <- pmb %>% mutate(CoreWord =
  case_when(
    !grepl("-", Word1) ~ Word1,
    !grepl("-", Word2) ~ Word2,
    !grepl("-", Word3) ~ Word3,
    !grepl("-", Word4) ~ Word4,
    !grepl("-", Word5) ~ Word5,
    TRUE ~ NA_character_))
# inspect the derived token columns
pmb %>% print(width = Inf)
## # A tibble: 4,678 × 12
## # Groups: Word [3,127]
## Sense
## <chr>
## 1 Damian is not normally a man of action so now, stirred into action, he…
## 2 male.n.02
## 3 -
## 4 be.v.01
## 5 time.n.08
## 6 normally.r.01
## 7 man.n.01
## 8 action.n.06
## 9 -
## 10 entity.n.01
## Operator Word
## <chr> <chr>
## 1 "-" FULL_SENTENCE
## 2 "Name \"Damian\"" % Damian [0-6]
## 3 "NEGATION <1" %
## 4 "Theme -1 Time +1 Manner +2 Co-Theme +3" % is --not-- [7-13]
## 5 "EQU now" %
## 6 "-" % normally [14-22]
## 7 "Attribute +1" % --a-- man --of-- [23-31]
## 8 "-" % action --so-- [32-41]
## 9 "EXPLANATION <1" %
## 10 "-" %
## DocID sent_nr Word1 Word2 Word3 Word4 Word5
## <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 d0352_p31.txt 286 <NA> <NA> <NA> <NA> <NA>
## 2 d0352_p31.txt 286 Damian <NA> <NA> <NA> <NA>
## 3 d0352_p31.txt 286 <NA> <NA> <NA> <NA> <NA>
## 4 d0352_p31.txt 286 is --not-- <NA> <NA> <NA>
## 5 d0352_p31.txt 286 <NA> <NA> <NA> <NA> <NA>
## 6 d0352_p31.txt 286 normally <NA> <NA> <NA> <NA>
## 7 d0352_p31.txt 286 --a-- man --of-- <NA> <NA>
## 8 d0352_p31.txt 286 action --so-- <NA> <NA> <NA>
## 9 d0352_p31.txt 286 <NA> <NA> <NA> <NA> <NA>
## 10 d0352_p31.txt 286 <NA> <NA> <NA> <NA> <NA>
## CharactersOrder CoreWord
## <chr> <chr>
## 1 <NA> <NA>
## 2 [0-6] Damian
## 3 <NA> <NA>
## 4 [7-13] is
## 5 <NA> <NA>
## 6 [14-22] normally
## 7 [23-31] man
## 8 [32-41] action
## 9 <NA> <NA>
## 10 <NA> <NA>
## # ℹ 4,668 more rows
# uniquepos ensures that we merge the right token (in case more than one
# token appears in the same sentence, they are ordered).
# FIX: row_number() replaces 1:length(...) — the 1:length() idiom is a
# known anti-pattern; row_number() is the idiomatic per-group counter.
reading <- reading %>%
  group_by(subj_nr, sent_nr, wordstripped) %>%
  mutate(uniquepos = row_number()) %>%
  ungroup()
# "the" appears twice in this sentence and gets uniquepos 1 and 2; note
# that this holds even though the subject did not attend those words in
# ET (RTs are 0)
reading %>%
  filter(sent_nr == 100 & exp == "et") %>%
  print(n = 10, width = Inf)
## # A tibble: 378 × 42
## subj_nr sent_nr sent_pos correct answer_time word_pos word RT
## <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 ET_1 100 62 c 3058 1 Andrew NA
## 2 ET_1 100 62 c 3058 2 closed NA
## 3 ET_1 100 62 c 3058 3 the NA
## 4 ET_1 100 62 c 3058 4 office NA
## 5 ET_1 100 62 c 3058 5 door NA
## 6 ET_1 100 62 c 3058 6 on NA
## 7 ET_1 100 62 c 3058 7 the NA
## 8 ET_1 100 62 c 3058 8 way NA
## 9 ET_1 100 62 c 3058 9 out. NA
## 10 ET_2 100 64 c 1999 1 Andrew NA
## exp RTfirstfix RTfirstpass RTrightbound RTgopast wordstripped
## <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 et 272 272 272 272 Andrew
## 2 et 136 272 272 272 closed
## 3 et 0 0 0 0 the
## 4 et 148 148 148 148 office
## 5 et 228 228 228 228 door
## 6 et 172 172 172 172 on
## 7 et 0 0 0 0 the
## 8 et 320 320 320 320 way
## 9 et 172 172 172 172 out
## 10 et 220 220 220 220 Andrew
## FreqCount Cbeebies_freq CBBC_freq BNC_freq `LogFreq(Zipf)`
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 14860 53 410 4490 4.87
## 2 9941 185 439 9877 4.69
## 3 9418422 186232 512056 6187927 7.67
## 4 27400 183 737 25684 5.13
## 5 36644 1498 3027 25365 5.26
## 6 1569081 43153 108881 724197 6.89
## 7 9418422 186232 512056 6187927 7.67
## 8 264662 9916 18404 96280 6.12
## 9 611911 17242 50842 155352 6.48
## 10 14860 53 410 4490 4.87
## `LogFreqCbeebies(Zipf)` `LogFreqCBBC(Zipf)` `LogFreqBNC(Zipf)` CD_count
## <dbl> <dbl> <dbl> <dbl>
## 1 3.96 4.48 4.65 4618
## 2 4.5 4.51 4.99 7075
## 3 7.5 7.57 7.79 44361
## 4 4.5 4.73 5.41 12090
## 5 5.41 5.35 5.4 15360
## 6 6.87 6.9 6.86 43989
## 7 7.5 7.57 7.79 44361
## 8 6.23 6.13 5.98 40373
## 9 6.47 6.57 6.19 42763
## 10 3.96 4.48 4.65 4618
## CD_count_Cbeebies CD_count_CBBC CD CD_cbeebies CD_cbbc DomPoS
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 19 95 0.102 0.004 0.02 name
## 2 136 344 0.157 0.028 0.071 verb
## 3 4692 4756 0.984 0.968 0.983 determiner
## 4 74 476 0.268 0.015 0.098 noun
## 5 783 1315 0.341 0.162 0.272 noun
## 6 4498 4684 0.975 0.928 0.968 preposition
## 7 4692 4756 0.984 0.968 0.983 determiner
## 8 3062 4239 0.895 0.632 0.876 noun
## 9 3829 4590 0.948 0.79 0.948 preposition
## 10 19 95 0.102 0.004 0.02 name
## DomPoSLemma DomPoSFreq DomPoSLemmaTotalFreq
## <chr> <dbl> <dbl>
## 1 andrew 14857 14857
## 2 close 8586 20167
## 3 the 9412235 9412235
## 4 office 19160 21625
## 5 door 35708 46544
## 6 on 1566464 1566464
## 7 the 9412235 9412235
## 8 way 262353 281221
## 9 out 580544 580544
## 10 andrew 14857 14857
## AllPoS
## <chr>
## 1 .name.verb.noun.
## 2 .verb.adjective.name.
## 3 .determiner.name.verb.noun.adverb.number.adjective.
## 4 .noun.name.
## 5 .noun.name.
## 6 .preposition.adverb.name.adjective.verb.
## 7 .determiner.name.verb.noun.adverb.number.adjective.
## 8 .noun.adverb.name.
## 9 .preposition.adverb.adjective.noun.name.verb.
## 10 .name.verb.noun.
## AllPoSFreq CapitFreq Spell_check Hyphen
## <chr> <dbl> <chr> <dbl>
## 1 .14857.2.1. 14857 X 0
## 2 .8586.1346.9. 152 UKUS 0
## 3 .9412235.6164.17.2.2.1.1. 946828 UKUS 0
## 4 .19160.8240. 8253 UKUS 0
## 5 .35708.936. 1228 UKUS 0
## 6 .1566464.2291.324.1.1. 54685 UKUS 0
## 7 .9412235.6164.17.2.2.1.1. 946828 UKUS 0
## 8 .262353.1194.1115. 3379 UKUS 0
## 9 .580544.30770.216.189.184.8. 9472 UKUS 0
## 10 .14857.2.1. 14857 X 0
## Homophonic_entry Double_word_entry Freq_double_entry logfreqscaled
## <chr> <chr> <chr> <dbl>
## 1 <NA> <NA> <NA> -0.871
## 2 <NA> <NA> <NA> -1.01
## 3 <NA> <NA> <NA> 1.44
## 4 <NA> <NA> <NA> -0.651
## 5 <NA> <NA> <NA> -0.547
## 6 <NA> <NA> <NA> 0.798
## 7 <NA> <NA> <NA> 1.44
## 8 way, weigh, whey 0.161
## 9 <NA> <NA> <NA> 0.461
## 10 <NA> <NA> <NA> -0.871
## uniquepos
## <int>
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 2
## 8 1
## 9 1
## 10 1
## # ℹ 368 more rows
# We currently merge by CoreWord; another option is to merge by Word1,
# Word2, or Word3.
# FIX: row_number() replaces the 1:length(...) anti-pattern (same change
# as in the reading data above).
pmb <- pmb %>%
  group_by(sent_nr, CoreWord) %>%
  mutate(uniquepos = row_number()) %>%
  ungroup()
# spot-check one sentence
pmb %>% filter(sent_nr == 100) %>% print()
## # A tibble: 8 × 13
## Sense Operator Word DocID sent_nr Word1 Word2 Word3 Word4 Word5
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 Andrew close… "-" FULL… d035… 100 <NA> <NA> <NA> <NA> <NA>
## 2 male.n.02 "Name \… % An… d035… 100 Andr… <NA> <NA> <NA> <NA>
## 3 close.v.01 "Agent … % cl… d035… 100 clos… <NA> <NA> <NA> <NA>
## 4 time.n.08 "TPR no… % d035… 100 <NA> <NA> <NA> <NA> <NA>
## 5 office.n.01 "Attrib… % --… d035… 100 --th… offi… <NA> <NA> <NA>
## 6 door.n.01 "-" % do… d035… 100 door --on… <NA> <NA> <NA>
## 7 way.n.07 "-" % --… d035… 100 --th… way <NA> <NA> <NA>
## 8 out.a.06 "Attrib… % ou… d035… 100 out --.-- <NA> <NA> <NA>
## # ℹ 3 more variables: CharactersOrder <chr>, CoreWord <chr>,
## # uniquepos <int>
# merge PMB tokens onto the reading data by sentence, stripped word form,
# and within-sentence token order
pmb <- pmb %>% mutate(wordstripped = CoreWord)
merged <- pmb %>%
  inner_join(reading, by = c("sent_nr", "wordstripped", "uniquepos"))
# If you use Word1, Word2 and Word3 together, then uncomment this part
#pmb <- pmb %>% group_by(sent_nr, Word1) %>% mutate(uniquepos1=1:length(Word1)) %>% ungroup()
#pmb <- pmb %>% group_by(sent_nr, Word2) %>% mutate(uniquepos2=1:length(Word2)) %>% ungroup()
#pmb <- pmb %>% group_by(sent_nr, Word3) %>% mutate(uniquepos3=1:length(Word3)) %>% ungroup()
#pmb$wordstripped <- pmb$Word1
# uniquepos1 is used also for match
#pmb$uniquepos <- pmb$uniquepos1
#part1 <- inner_join(pmb, reading, by=c("sent_nr", "wordstripped", "uniquepos"))
#part1$basedonword <- 1
#print(part1, width=Inf)
#pmb$wordstripped <- pmb$Word2
# uniquepos2 is used also for match
#pmb$uniquepos <- pmb$uniquepos2
#part2 <- inner_join(pmb, reading, by=c("sent_nr", "wordstripped", "uniquepos"))
#part2$basedonword <- 2
#pmb$wordstripped <- pmb$Word3
# uniquepos3 is used also for match
#pmb$uniquepos <- pmb$uniquepos3
#part3 <- inner_join(pmb, reading, by=c("sent_nr", "wordstripped", "uniquepos"))
#part3$basedonword <- 3
#merged <- rbind(part1, part2, part3)
# inspect the merged dataset
merged %>% print(width = Inf)
## # A tibble: 276,692 × 53
## Sense Operator Word DocID sent_nr
## <chr> <chr> <chr> <chr> <dbl>
## 1 male.n.02 "Name \"Damian\"" % Damian [0-6] d0352_p31.txt 286
## 2 male.n.02 "Name \"Damian\"" % Damian [0-6] d0352_p31.txt 286
## 3 male.n.02 "Name \"Damian\"" % Damian [0-6] d0352_p31.txt 286
## 4 male.n.02 "Name \"Damian\"" % Damian [0-6] d0352_p31.txt 286
## 5 male.n.02 "Name \"Damian\"" % Damian [0-6] d0352_p31.txt 286
## 6 male.n.02 "Name \"Damian\"" % Damian [0-6] d0352_p31.txt 286
## 7 male.n.02 "Name \"Damian\"" % Damian [0-6] d0352_p31.txt 286
## 8 male.n.02 "Name \"Damian\"" % Damian [0-6] d0352_p31.txt 286
## 9 male.n.02 "Name \"Damian\"" % Damian [0-6] d0352_p31.txt 286
## 10 male.n.02 "Name \"Damian\"" % Damian [0-6] d0352_p31.txt 286
## Word1 Word2 Word3 Word4 Word5 CharactersOrder CoreWord uniquepos
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int>
## 1 Damian <NA> <NA> <NA> <NA> [0-6] Damian 1
## 2 Damian <NA> <NA> <NA> <NA> [0-6] Damian 1
## 3 Damian <NA> <NA> <NA> <NA> [0-6] Damian 1
## 4 Damian <NA> <NA> <NA> <NA> [0-6] Damian 1
## 5 Damian <NA> <NA> <NA> <NA> [0-6] Damian 1
## 6 Damian <NA> <NA> <NA> <NA> [0-6] Damian 1
## 7 Damian <NA> <NA> <NA> <NA> [0-6] Damian 1
## 8 Damian <NA> <NA> <NA> <NA> [0-6] Damian 1
## 9 Damian <NA> <NA> <NA> <NA> [0-6] Damian 1
## 10 Damian <NA> <NA> <NA> <NA> [0-6] Damian 1
## wordstripped subj_nr sent_pos correct answer_time word_pos word RT
## <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 Damian 1 254 - NaN 1 Damian 179
## 2 Damian 4 82 - NaN 1 Damian 185
## 3 Damian 9 39 - NaN 1 Damian 249
## 4 Damian 10 186 - NaN 1 Damian 169
## 5 Damian 11 59 - NaN 1 Damian 202
## 6 Damian 13 89 - NaN 1 Damian 208
## 7 Damian 14 7 - NaN 1 Damian 826
## 8 Damian 15 170 - NaN 1 Damian 176
## 9 Damian 16 55 - NaN 1 Damian 234
## 10 Damian 17 173 - NaN 1 Damian 233
## exp RTfirstfix RTfirstpass RTrightbound RTgopast FreqCount
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 spr NA NA NA NA 833
## 2 spr NA NA NA NA 833
## 3 spr NA NA NA NA 833
## 4 spr NA NA NA NA 833
## 5 spr NA NA NA NA 833
## 6 spr NA NA NA NA 833
## 7 spr NA NA NA NA 833
## 8 spr NA NA NA NA 833
## 9 spr NA NA NA NA 833
## 10 spr NA NA NA NA 833
## Cbeebies_freq CBBC_freq BNC_freq `LogFreq(Zipf)`
## <dbl> <dbl> <dbl> <dbl>
## 1 16 12 486 3.62
## 2 16 12 486 3.62
## 3 16 12 486 3.62
## 4 16 12 486 3.62
## 5 16 12 486 3.62
## 6 16 12 486 3.62
## 7 16 12 486 3.62
## 8 16 12 486 3.62
## 9 16 12 486 3.62
## 10 16 12 486 3.62
## `LogFreqCbeebies(Zipf)` `LogFreqCBBC(Zipf)` `LogFreqBNC(Zipf)` CD_count
## <dbl> <dbl> <dbl> <dbl>
## 1 3.46 2.98 3.69 318
## 2 3.46 2.98 3.69 318
## 3 3.46 2.98 3.69 318
## 4 3.46 2.98 3.69 318
## 5 3.46 2.98 3.69 318
## 6 3.46 2.98 3.69 318
## 7 3.46 2.98 3.69 318
## 8 3.46 2.98 3.69 318
## 9 3.46 2.98 3.69 318
## 10 3.46 2.98 3.69 318
## CD_count_Cbeebies CD_count_CBBC CD CD_cbeebies CD_cbbc DomPoS
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2 7 0.007 0 0.001 name
## 2 2 7 0.007 0 0.001 name
## 3 2 7 0.007 0 0.001 name
## 4 2 7 0.007 0 0.001 name
## 5 2 7 0.007 0 0.001 name
## 6 2 7 0.007 0 0.001 name
## 7 2 7 0.007 0 0.001 name
## 8 2 7 0.007 0 0.001 name
## 9 2 7 0.007 0 0.001 name
## 10 2 7 0.007 0 0.001 name
## DomPoSLemma DomPoSFreq DomPoSLemmaTotalFreq AllPoS
## <chr> <dbl> <dbl> <chr>
## 1 damian 704 704 .name.adjective.verb.noun.
## 2 damian 704 704 .name.adjective.verb.noun.
## 3 damian 704 704 .name.adjective.verb.noun.
## 4 damian 704 704 .name.adjective.verb.noun.
## 5 damian 704 704 .name.adjective.verb.noun.
## 6 damian 704 704 .name.adjective.verb.noun.
## 7 damian 704 704 .name.adjective.verb.noun.
## 8 damian 704 704 .name.adjective.verb.noun.
## 9 damian 704 704 .name.adjective.verb.noun.
## 10 damian 704 704 .name.adjective.verb.noun.
## AllPoSFreq CapitFreq Spell_check Hyphen Homophonic_entry
## <chr> <dbl> <chr> <dbl> <chr>
## 1 .704.123.3.3. 828 X 0 <NA>
## 2 .704.123.3.3. 828 X 0 <NA>
## 3 .704.123.3.3. 828 X 0 <NA>
## 4 .704.123.3.3. 828 X 0 <NA>
## 5 .704.123.3.3. 828 X 0 <NA>
## 6 .704.123.3.3. 828 X 0 <NA>
## 7 .704.123.3.3. 828 X 0 <NA>
## 8 .704.123.3.3. 828 X 0 <NA>
## 9 .704.123.3.3. 828 X 0 <NA>
## 10 .704.123.3.3. 828 X 0 <NA>
## Double_word_entry Freq_double_entry logfreqscaled
## <chr> <chr> <dbl>
## 1 <NA> <NA> -1.90
## 2 <NA> <NA> -1.90
## 3 <NA> <NA> -1.90
## 4 <NA> <NA> -1.90
## 5 <NA> <NA> -1.90
## 6 <NA> <NA> -1.90
## 7 <NA> <NA> -1.90
## 8 <NA> <NA> -1.90
## 9 <NA> <NA> -1.90
## 10 <NA> <NA> -1.90
## # ℹ 276,682 more rows
# sanity check: merged words match and sit at the right word_pos
merged %>%
  filter(sent_nr == 100) %>%
  group_by(word) %>%
  summarise(first(CoreWord), first(wordstripped), first(word_pos)) %>%
  print(width = Inf)
## # A tibble: 6 × 4
## word `first(CoreWord)` `first(wordstripped)` `first(word_pos)`
## <chr> <chr> <chr> <dbl>
## 1 Andrew Andrew Andrew 1
## 2 closed closed closed 2
## 3 door door door 5
## 4 office office office 4
## 5 out. out out 9
## 6 way way way 8
# drop the auxiliary merge columns and restore reading order
merged <- merged %>%
  select(-c(uniquepos, wordstripped)) %>%
  arrange(exp, subj_nr, sent_nr, word_pos)
We store the results.
# full dataset: reading measures + PMB annotations + frequency measures
merged %>% write_tsv("UCL_corpus_with_PMB_and_frequencies.tsv")
We store the same results, but without the frequency information.
# Drop all frequency-derived columns (everything from FreqCount to the end
# of the table).
# FIX: last_col() replaces ncol(merged) — the original mixed a bare column
# name with a numeric position inside one range, which tidyselect tolerates
# but which is fragile; last_col() keeps the selection purely name-based.
merged_small <- merged %>% select(-c(FreqCount:last_col()))
write_tsv(merged_small, "UCL_corpus_with_PMB_without_frequencies.tsv")