We also include other information useful for psycholinguists - namely, the SUBTLEX-UK word frequency database from van Heuven et al. (2014).

library(tidyverse)

Load data.

# freq taken from:
# van Heuven, W. J. B., Mandera, P., Keuleers, E., & Brysbaert, M. (2014). SUBTLEX-UK: A new and improved word frequency database for British English. The Quarterly Journal of Experimental Psychology, 67(6), 1176–1190. https://doi.org/10.1080/17470218.2013.850521
# see here: https://osf.io/zq49t (the original file is in xlsx, we save it as csv)

# The file is comma-separated, so it must be read with read_csv();
# the original read_table() call split on whitespace and mangled the
# multi-word fields, producing the ~148k parsing failures echoed below.
freq <- read_csv("subtlex_uk.csv")
## 
## ── Column specification ──────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   Spelling = col_character(),
##   DomPoS = col_character(),
##   DomPoSLemma = col_character(),
##   AllPoS = col_character(),
##   AllPoSFreq = col_character(),
##   Spell_check = col_character(),
##   Homophonic_entry = col_character(),
##   Double_word_entry = col_character(),
##   Freq_double_entry = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 148414 parsing failures.
## row col   expected     actual             file
##   1  -- 27 columns 25 columns 'subtlex_uk.csv'
##   3  -- 27 columns 25 columns 'subtlex_uk.csv'
##   8  -- 27 columns 25 columns 'subtlex_uk.csv'
##   9  -- 27 columns 28 columns 'subtlex_uk.csv'
##  10  -- 27 columns 25 columns 'subtlex_uk.csv'
## ... ... .......... .......... ................
## See problems(...) for more details.
# Self-paced reading data (Frank et al.): whitespace-delimited, one row per
# subject x word token, so read_table() parses it cleanly (see spec below).
spr <- read_table("frank_etal/selfpacedreading.RT.txt")
## 
## ── Column specification ──────────────────────────────────────────────────
## cols(
##   subj_nr = col_double(),
##   sent_nr = col_double(),
##   sent_pos = col_double(),
##   correct = col_character(),
##   answer_time = col_character(),
##   word_pos = col_double(),
##   word = col_character(),
##   RT = col_double()
## )
# Eye-tracking data (Frank et al.): same layout as the SPR file, with four
# reading-time measures per word instead of one.
et <- read_table("frank_etal/eyetracking.RT.txt")
## 
## ── Column specification ──────────────────────────────────────────────────
## cols(
##   subj_nr = col_double(),
##   sent_nr = col_double(),
##   sent_pos = col_double(),
##   correct = col_character(),
##   answer_time = col_character(),
##   word_pos = col_double(),
##   word = col_character(),
##   RTfirstfix = col_double(),
##   RTfirstpass = col_double(),
##   RTrightbound = col_double(),
##   RTgopast = col_double()
## )
# Inspect the frequency table (160k words, 27 columns).
print(freq, n=10)
## # A tibble: 160,022 × 27
##    Spelling   FreqCount Cbeebies_freq CBBC_freq BNC_freq `LogFreq(Zipf)`
##    <chr>          <dbl>         <dbl>     <dbl>    <dbl>           <dbl>
##  1 labour         45591             1        81    27108            5.35
##  2 programme      31950            42       731    19071            5.2 
##  3 favourite      27052          2015      3481     4769            5.13
##  4 colour         22651          1665      1789    11541            5.05
##  5 realise        15715           117       910     3904            4.89
##  6 honourable     13956             2        28      887            4.84
##  7 metres         12695            20      1459     3390            4.8 
##  8 flavour        10745           160       472     1494            4.73
##  9 colours         9924           968       878     4463            4.69
## 10 recognise       9913            82       446     3648            4.69
## # ℹ 160,012 more rows
## # ℹ 21 more variables: `LogFreqCbeebies(Zipf)` <dbl>,
## #   `LogFreqCBBC(Zipf)` <dbl>, `LogFreqBNC(Zipf)` <dbl>, CD_count <dbl>,
## #   CD_count_Cbeebies <dbl>, CD_count_CBBC <dbl>, CD <dbl>,
## #   CD_cbeebies <dbl>, CD_cbbc <dbl>, DomPoS <chr>, DomPoSLemma <chr>,
## #   DomPoSFreq <dbl>, DomPoSLemmaTotalFreq <dbl>, AllPoS <chr>,
## #   AllPoSFreq <chr>, CapitFreq <dbl>, Spell_check <chr>, Hyphen <dbl>, …
# Inspect the self-paced reading data.
print(spr, n=10)
## # A tibble: 353,584 × 8
##    subj_nr sent_nr sent_pos correct answer_time word_pos word         RT
##      <dbl>   <dbl>    <dbl> <chr>   <chr>          <dbl> <chr>     <dbl>
##  1       1       2       12 c       3630               1 Billy       376
##  2       1       2       12 c       3630               2 wrote       364
##  3       1       2       12 c       3630               3 on          394
##  4       1       2       12 c       3630               4 the         353
##  5       1       2       12 c       3630               5 envelope.   354
##  6       1       3       32 -       NaN                1 He          354
##  7       1       3       32 -       NaN                2 called      449
##  8       1       3       32 -       NaN                3 over        409
##  9       1       3       32 -       NaN                4 his         362
## 10       1       3       32 -       NaN                5 shoulder.   361
## # ℹ 353,574 more rows
# Inspect the eye-tracking data.
print(et, n=10)
## # A tibble: 81,109 × 11
##    subj_nr sent_nr sent_pos correct answer_time word_pos word   RTfirstfix
##      <dbl>   <dbl>    <dbl> <chr>   <chr>          <dbl> <chr>       <dbl>
##  1       1       1      127 -       NaN                1 Anne          216
##  2       1       1      127 -       NaN                2 lost          152
##  3       1       1      127 -       NaN                3 contr…        144
##  4       1       1      127 -       NaN                4 and           184
##  5       1       1      127 -       NaN                5 laugh…        244
##  6       1       2      150 c       2106               1 Billy         272
##  7       1       2      150 c       2106               2 wrote         160
##  8       1       2      150 c       2106               3 on            280
##  9       1       2      150 c       2106               4 the             0
## 10       1       2      150 c       2106               5 envel…        264
## # ℹ 81,099 more rows
## # ℹ 3 more variables: RTfirstpass <dbl>, RTrightbound <dbl>,
## #   RTgopast <dbl>
# Tag each dataset with its paradigm and add the other paradigm's RT
# columns as NA, so both data frames share one column set for rbind().
spr <- spr %>%
    mutate(exp = "spr",
           RTfirstfix = NA,
           RTfirstpass = NA,
           RTrightbound = NA,
           RTgopast = NA)
et <- et %>%
    mutate(exp = "et",
           RT = NA)

How many sentences did people read in et (eye tracking) and spr (self-paced reading)?

# Number of distinct sentences each eye-tracking subject read.
et %>%
    group_by(subj_nr) %>%
    summarise(n_sent = n_distinct(sent_nr, na.rm = TRUE)) %>%
    print(n = 20)
## # A tibble: 43 × 2
##    subj_nr n_sent
##      <dbl>  <int>
##  1       1    205
##  2       2    205
##  3       3    205
##  4       4    205
##  5       5    205
##  6       6    205
##  7       7    205
##  8       8    204
##  9       9     35
## 10      10    205
## 11      11    205
## 12      12    205
## 13      13    205
## 14      14    205
## 15      15    205
## 16      16    205
## 17      17    175
## 18      18    205
## 19      19    205
## 20      20    205
## # ℹ 23 more rows
# Average number of sentences read per eye-tracking subject.
et %>%
    group_by(subj_nr) %>%
    summarise(n_sent = n_distinct(sent_nr, na.rm = TRUE)) %>%
    ungroup() %>%
    summarise(mean(n_sent)) %>%
    print(n = 20)
## # A tibble: 1 × 1
##   `mean(n_sent)`
##            <dbl>
## 1           200.
# Number of distinct sentences each SPR subject read.
# NOTE: the original call also computed mean(n_sent) inside the grouped
# summarise; with one row per subject that column merely duplicated n_sent
# (visible in the echoed output), so it is dropped here — the overall mean
# across subjects is computed in the next call.
print(spr %>% group_by(subj_nr) %>% summarise(n_sent = n_distinct(sent_nr, na.rm = TRUE)), n = 20)
## # A tibble: 117 × 3
##    subj_nr n_sent `mean(n_sent)`
##      <dbl>  <int>          <dbl>
##  1       1    290            290
##  2       2    237            237
##  3       3    143            143
##  4       4    177            177
##  5       5    120            120
##  6       6    230            230
##  7       7    152            152
##  8       8    167            167
##  9       9    171            171
## 10      10    214            214
## 11      11    264            264
## 12      12    212            212
## 13      13    314            314
## 14      14    205            205
## 15      15    316            316
## 16      16    256            256
## 17      17    361            361
## 18      18    222            222
## 19      19    132            132
## 20      20    243            243
## # ℹ 97 more rows
# Average number of sentences read per SPR subject.
spr %>%
    group_by(subj_nr) %>%
    summarise(n_sent = n_distinct(sent_nr, na.rm = TRUE)) %>%
    ungroup() %>%
    summarise(mean(n_sent)) %>%
    print(n = 20)
## # A tibble: 1 × 1
##   `mean(n_sent)`
##            <dbl>
## 1           219.

How many participants read each sentence in et and spr?

# Number of distinct eye-tracking subjects who read each sentence.
et %>%
    group_by(sent_nr) %>%
    summarise(n_subj = n_distinct(subj_nr, na.rm = TRUE)) %>%
    print(n = Inf)
## # A tibble: 205 × 2
##     sent_nr n_subj
##       <dbl>  <int>
##   1       1     43
##   2       2     42
##   3       3     42
##   4       4     40
##   5       5     42
##   6       6     42
##   7       7     41
##   8       8     41
##   9       9     42
##  10      10     41
##  11      11     42
##  12      12     43
##  13      13     42
##  14      14     41
##  15      15     42
##  16      16     43
##  17      17     42
##  18      18     42
##  19      19     42
##  20      20     41
##  21      21     42
##  22      22     41
##  23      23     41
##  24      24     42
##  25      25     41
##  26      26     42
##  27      27     42
##  28      28     42
##  29      29     42
##  30      30     43
##  31      31     42
##  32      32     41
##  33      33     41
##  34      34     42
##  35      35     42
##  36      36     42
##  37      37     41
##  38      38     42
##  39      39     42
##  40      40     42
##  41      41     42
##  42      42     41
##  43      43     43
##  44      44     42
##  45      45     42
##  46      46     42
##  47      47     42
##  48      48     42
##  49      49     42
##  50      50     42
##  51      51     42
##  52      52     42
##  53      53     42
##  54      54     43
##  55      55     42
##  56      56     42
##  57      57     43
##  58      58     41
##  59      59     42
##  60      60     41
##  61      61     42
##  62      62     43
##  63      63     42
##  64      64     42
##  65      65     43
##  66      66     42
##  67      67     41
##  68      68     42
##  69      69     42
##  70      70     42
##  71      71     41
##  72      72     43
##  73      73     42
##  74      74     42
##  75      75     42
##  76      76     42
##  77      77     42
##  78      78     42
##  79      79     42
##  80      80     42
##  81      81     42
##  82      82     42
##  83      83     42
##  84      84     42
##  85      85     43
##  86      86     42
##  87      87     43
##  88      88     41
##  89      89     41
##  90      90     42
##  91      91     43
##  92      92     42
##  93      93     43
##  94      94     42
##  95      95     43
##  96      96     42
##  97      97     42
##  98      98     42
##  99      99     42
## 100     100     42
## 101     101     42
## 102     102     42
## 103     103     42
## 104     104     42
## 105     105     42
## 106     106     42
## 107     107     42
## 108     108     43
## 109     109     42
## 110     110     42
## 111     111     42
## 112     112     42
## 113     113     42
## 114     114     42
## 115     115     42
## 116     116     42
## 117     117     41
## 118     118     41
## 119     119     42
## 120     120     42
## 121     121     42
## 122     122     42
## 123     123     43
## 124     124     43
## 125     125     42
## 126     126     42
## 127     127     42
## 128     128     42
## 129     129     42
## 130     130     41
## 131     131     42
## 132     132     41
## 133     133     41
## 134     134     41
## 135     135     42
## 136     136     42
## 137     137     42
## 138     138     42
## 139     139     43
## 140     140     42
## 141     141     43
## 142     142     42
## 143     143     43
## 144     144     41
## 145     145     42
## 146     146     42
## 147     147     42
## 148     148     43
## 149     149     41
## 150     150     43
## 151     151     42
## 152     152     42
## 153     154     42
## 154     155     42
## 155     156     42
## 156     157     42
## 157     158     41
## 158     159     43
## 159     160     42
## 160     161     42
## 161     162     42
## 162     163     42
## 163     164     42
## 164     165     41
## 165     166     42
## 166     169     42
## 167     170     42
## 168     172     42
## 169     173     42
## 170     174     43
## 171     175     42
## 172     176     43
## 173     177     42
## 174     178     41
## 175     179     42
## 176     181     42
## 177     182     43
## 178     183     42
## 179     185     42
## 180     186     42
## 181     187     42
## 182     188     42
## 183     189     42
## 184     190     42
## 185     192     43
## 186     193     42
## 187     194     42
## 188     195     42
## 189     197     41
## 190     198     42
## 191     199     43
## 192     201     42
## 193     202     42
## 194     209     42
## 195     210     42
## 196     211     42
## 197     221     42
## 198     226     42
## 199     227     42
## 200     228     42
## 201     230     42
## 202     235     41
## 203     239     43
## 204     241     42
## 205     246     42
# Average number of eye-tracking subjects per sentence.
et %>%
    group_by(sent_nr) %>%
    summarise(n_subj = n_distinct(subj_nr, na.rm = TRUE)) %>%
    ungroup() %>%
    summarise(mean(n_subj)) %>%
    print(n = Inf)
## # A tibble: 1 × 1
##   `mean(n_subj)`
##            <dbl>
## 1           42.0
# Number of distinct SPR subjects who read each sentence.
spr %>%
    group_by(sent_nr) %>%
    summarise(n_subj = n_distinct(subj_nr, na.rm = TRUE)) %>%
    print(n = Inf)
## # A tibble: 361 × 2
##     sent_nr n_subj
##       <dbl>  <int>
##   1       1     68
##   2       2     77
##   3       3     67
##   4       4     71
##   5       5     71
##   6       6     69
##   7       7     67
##   8       8     68
##   9       9     78
##  10      10     82
##  11      11     65
##  12      12     67
##  13      13     70
##  14      14     79
##  15      15     67
##  16      16     78
##  17      17     69
##  18      18     69
##  19      19     66
##  20      20     68
##  21      21     74
##  22      22     67
##  23      23     77
##  24      24     69
##  25      25     70
##  26      26     78
##  27      27     65
##  28      28     69
##  29      29     68
##  30      30     76
##  31      31     72
##  32      32     73
##  33      33     74
##  34      34     72
##  35      35     58
##  36      36     72
##  37      37     62
##  38      38     62
##  39      39     76
##  40      40     69
##  41      41     76
##  42      42     70
##  43      43     65
##  44      44     82
##  45      45     69
##  46      46     75
##  47      47     60
##  48      48     67
##  49      49     67
##  50      50     71
##  51      51     76
##  52      52     71
##  53      53     73
##  54      54     74
##  55      55     75
##  56      56     73
##  57      57     70
##  58      58     59
##  59      59     75
##  60      60     83
##  61      61     70
##  62      62     75
##  63      63     72
##  64      64     70
##  65      65     77
##  66      66     70
##  67      67     60
##  68      68     66
##  69      69     71
##  70      70     68
##  71      71     72
##  72      72     66
##  73      73     76
##  74      74     62
##  75      75     74
##  76      76     77
##  77      77     67
##  78      78     68
##  79      79     74
##  80      80     63
##  81      81     78
##  82      82     74
##  83      83     69
##  84      84     61
##  85      85     71
##  86      86     81
##  87      87     73
##  88      88     73
##  89      89     76
##  90      90     74
##  91      91     65
##  92      92     66
##  93      93     60
##  94      94     75
##  95      95     71
##  96      96     78
##  97      97     65
##  98      98     70
##  99      99     72
## 100     100     70
## 101     101     68
## 102     102     68
## 103     103     66
## 104     104     71
## 105     105     77
## 106     106     72
## 107     107     68
## 108     108     66
## 109     109     69
## 110     110     69
## 111     111     74
## 112     112     63
## 113     113     72
## 114     114     67
## 115     115     66
## 116     116     72
## 117     117     66
## 118     118     78
## 119     119     72
## 120     120     71
## 121     121     77
## 122     122     67
## 123     123     69
## 124     124     69
## 125     125     81
## 126     126     78
## 127     127     76
## 128     128     70
## 129     129     67
## 130     130     65
## 131     131     83
## 132     132     62
## 133     133     73
## 134     134     69
## 135     135     68
## 136     136     68
## 137     137     82
## 138     138     68
## 139     139     74
## 140     140     69
## 141     141     62
## 142     142     80
## 143     143     70
## 144     144     73
## 145     145     68
## 146     146     74
## 147     147     65
## 148     148     80
## 149     149     66
## 150     150     77
## 151     151     74
## 152     152     68
## 153     153     63
## 154     154     70
## 155     155     77
## 156     156     67
## 157     157     70
## 158     158     62
## 159     159     75
## 160     160     64
## 161     161     73
## 162     162     58
## 163     163     79
## 164     164     71
## 165     165     71
## 166     166     74
## 167     167     74
## 168     168     68
## 169     169     68
## 170     170     72
## 171     171     73
## 172     172     72
## 173     173     67
## 174     174     66
## 175     175     69
## 176     176     83
## 177     177     67
## 178     178     77
## 179     179     72
## 180     180     66
## 181     181     68
## 182     182     71
## 183     183     63
## 184     184     72
## 185     185     74
## 186     186     70
## 187     187     77
## 188     188     80
## 189     189     71
## 190     190     77
## 191     191     67
## 192     192     70
## 193     193     66
## 194     194     78
## 195     195     79
## 196     196     65
## 197     197     75
## 198     198     68
## 199     199     73
## 200     200     83
## 201     201     67
## 202     202     70
## 203     203     67
## 204     204     74
## 205     205     75
## 206     206     67
## 207     207     71
## 208     208     69
## 209     209     75
## 210     210     68
## 211     211     70
## 212     212     73
## 213     213     74
## 214     214     72
## 215     215     63
## 216     216     68
## 217     217     68
## 218     218     70
## 219     219     79
## 220     220     68
## 221     221     69
## 222     222     68
## 223     223     80
## 224     224     76
## 225     225     70
## 226     226     69
## 227     227     77
## 228     228     66
## 229     229     68
## 230     230     81
## 231     231     72
## 232     232     66
## 233     233     78
## 234     234     73
## 235     235     70
## 236     236     71
## 237     237     68
## 238     238     70
## 239     239     76
## 240     240     69
## 241     241     77
## 242     242     73
## 243     243     69
## 244     244     74
## 245     245     77
## 246     246     70
## 247     247     74
## 248     248     66
## 249     249     78
## 250     250     64
## 251     251     71
## 252     252     72
## 253     253     74
## 254     254     66
## 255     255     66
## 256     256     79
## 257     257     70
## 258     258     70
## 259     259     76
## 260     260     72
## 261     261     73
## 262     262     67
## 263     263     72
## 264     264     70
## 265     265     74
## 266     266     67
## 267     267     73
## 268     268     68
## 269     269     69
## 270     270     63
## 271     271     77
## 272     272     74
## 273     273     75
## 274     274     67
## 275     275     69
## 276     276     63
## 277     277     73
## 278     278     77
## 279     279     65
## 280     280     75
## 281     281     69
## 282     282     75
## 283     283     71
## 284     284     69
## 285     285     65
## 286     286     75
## 287     287     65
## 288     288     73
## 289     289     78
## 290     290     71
## 291     291     67
## 292     292     68
## 293     293     67
## 294     294     71
## 295     295     69
## 296     296     63
## 297     297     72
## 298     298     76
## 299     299     69
## 300     300     72
## 301     301     76
## 302     302     75
## 303     303     76
## 304     304     69
## 305     305     67
## 306     306     76
## 307     307     74
## 308     308     72
## 309     309     72
## 310     310     73
## 311     311     73
## 312     312     68
## 313     313     66
## 314     314     69
## 315     315     82
## 316     316     58
## 317     317     67
## 318     318     67
## 319     319     65
## 320     320     69
## 321     321     75
## 322     322     75
## 323     323     78
## 324     324     68
## 325     325     75
## 326     326     67
## 327     327     80
## 328     328     76
## 329     329     76
## 330     330     77
## 331     331     70
## 332     332     64
## 333     333     59
## 334     334     71
## 335     335     71
## 336     336     73
## 337     337     78
## 338     338     75
## 339     339     75
## 340     340     72
## 341     341     74
## 342     342     69
## 343     343     70
## 344     344     74
## 345     345     75
## 346     346     67
## 347     347     71
## 348     348     78
## 349     349     73
## 350     350     64
## 351     351     84
## 352     352     70
## 353     353     78
## 354     354     76
## 355     355     70
## 356     356     79
## 357     357     70
## 358     358     75
## 359     359     68
## 360     360     72
## 361     361     75
# Average number of SPR subjects per sentence.
spr %>%
    group_by(sent_nr) %>%
    summarise(n_subj = n_distinct(subj_nr, na.rm = TRUE)) %>%
    ungroup() %>%
    summarise(mean(n_subj)) %>%
    print(n = Inf)
## # A tibble: 1 × 1
##   `mean(n_subj)`
##            <dbl>
## 1           71.1

Preparation for merging with frequency measures and with PMB.

# Prefix eye-tracking subject IDs so they cannot collide with SPR IDs.
et <- et %>% mutate(subj_nr = paste0("ET_", subj_nr))

# Stack both experiments into one long table (columns were harmonized above).
reading <- rbind(spr, et)

# Strip punctuation and lower-case the words so they line up with the
# Spelling column of the frequency table, then attach the frequency
# measures and drop the temporary join key.
reading <- reading %>%
    mutate(wordstripped = gsub("[[:punct:]]", "", word),
           Spelling = tolower(wordstripped)) %>%
    left_join(freq, by = "Spelling") %>%
    select(-Spelling)

# Log-transform and z-scale the raw frequency counts.
reading <- reading %>% mutate(logfreqscaled = as.numeric(scale(log(FreqCount))))

Load the PMB annotations (presumably Parallel Meaning Bank data — the combined SBN file).

# PMB annotations: tab-separated, one row per annotation line with Sense,
# Operator, Word, DocID and sent_nr (see column spec below).
pmb <- read_tsv("data/merged/sbn_combined.tsv")
## Rows: 4678 Columns: 5
## ── Column specification ──────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Sense, Operator, Word, DocID
## dbl (1): sent_nr
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the start of the PMB table.
print(pmb, n=10, width=Inf)
## # A tibble: 4,678 × 5
##    Sense                                                                  
##    <chr>                                                                  
##  1 Damian is not normally a man of action so now, stirred into action, he…
##  2 male.n.02                                                              
##  3 -                                                                      
##  4 be.v.01                                                                
##  5 time.n.08                                                              
##  6 normally.r.01                                                          
##  7 man.n.01                                                               
##  8 action.n.06                                                            
##  9 -                                                                      
## 10 entity.n.01                                                            
##    Operator                                 Word                          
##    <chr>                                    <chr>                         
##  1 "-"                                      FULL_SENTENCE                 
##  2 "Name \"Damian\""                        % Damian       [0-6]          
##  3 "NEGATION <1"                            %                             
##  4 "Theme -1 Time +1 Manner +2 Co-Theme +3" % is --not--       [7-13]     
##  5 "EQU now"                                %                             
##  6 "-"                                      % normally     [14-22]        
##  7 "Attribute +1"                           % --a-- man --of--     [23-31]
##  8 "-"                                      % action --so--    [32-41]    
##  9 "EXPLANATION <1"                         %                             
## 10 "-"                                      %                             
##    DocID         sent_nr
##    <chr>           <dbl>
##  1 d0352_p31.txt     286
##  2 d0352_p31.txt     286
##  3 d0352_p31.txt     286
##  4 d0352_p31.txt     286
##  5 d0352_p31.txt     286
##  6 d0352_p31.txt     286
##  7 d0352_p31.txt     286
##  8 d0352_p31.txt     286
##  9 d0352_p31.txt     286
## 10 d0352_p31.txt     286
## # ℹ 4,668 more rows
# Inspect the end of the PMB table.
tail(pmb, n=20)
## # A tibble: 20 × 5
##    Sense                                      Operator Word  DocID sent_nr
##    <chr>                                      <chr>    <chr> <chr>   <dbl>
##  1 I am touched deeply in places.             -        FULL… d034…      22
##  2 person.n.01                                EQU spe… % I … d034…      22
##  3 time.n.08                                  EQU now  % am… d034…      22
##  4 touch.v.01                                 Experie… % to… d034…      22
##  5 deeply.r.01                                -        % de… d034…      22
##  6 place.n.03                                 -        % pl… d034…      22
##  7 He held his mother tight and she cried in… -        FULL… d035…     133
##  8 male.n.02                                  -        % He… d035…     133
##  9 hold.v.02                                  Agent -… % he… d035…     133
## 10 time.n.08                                  TPR now  %     d035…     133
## 11 male.n.02                                  ANA -3   % hi… d035…     133
## 12 person.n.01                                Role +1  % mo… d035…     133
## 13 mother.n.01                                Of -2    %     d035…     133
## 14 tight.a.01                                 -        % ti… d035…     133
## 15 -                                          CONTINU… %     d035…     133
## 16 female.n.02                                ANA -3   % sh… d035…     133
## 17 cry.v.02                                   Agent -… % cr… d035…     133
## 18 time.n.08                                  TPR now  %     d035…     133
## 19 male.n.02                                  ANA -10  % hi… d035…     133
## 20 chest.n.01                                 PartOf … % ch… d035…     133

Merging

We now merge the two datasets.

# The Word field can contain several whitespace-separated tokens (plus a
# leading "%" marker), so split it into up to five word slots.
# strsplit(...)[[1]] is safe here because the data are grouped by Word,
# making Word constant within each group; element 1 of the split is the
# "%" marker, so the word tokens start at index 2.
# (Fix vs. original: a stray empty argument between Word1 and Word2 in the
# mutate() call has been removed.)
pmb <- pmb %>%
    group_by(Word) %>%
    mutate(Word1 = strsplit(Word, "\\s+")[[1]][2],
           Word2 = strsplit(Word, "\\s+")[[1]][3],
           Word3 = strsplit(Word, "\\s+")[[1]][4],
           Word4 = strsplit(Word, "\\s+")[[1]][5],
           Word5 = strsplit(Word, "\\s+")[[1]][6])

# Pull the "[start-end]" character-span token (pattern [n-n]) out of the
# word slots into its own CharactersOrder column, then blank that token
# from Word1..Word5. CharactersOrder is computed first, so case_when()
# sees the slots before across() overwrites them.
char_span_pat <- "^\\[\\d+-\\d+\\]$"
pmb <- pmb %>% mutate(
    CharactersOrder = case_when(
        grepl(char_span_pat, Word1) ~ Word1,
        grepl(char_span_pat, Word2) ~ Word2,
        grepl(char_span_pat, Word3) ~ Word3,
        grepl(char_span_pat, Word4) ~ Word4,
        grepl(char_span_pat, Word5) ~ Word5,
        TRUE ~ NA_character_
    ),
    across(Word1:Word5, ~ ifelse(grepl(char_span_pat, .x), NA, .x))
)

# Some tokens are the semantic core of the meaning; in the Word column the
# auxiliary tokens are wrapped in dashes (e.g. "--a--"), so the first slot
# WITHOUT a dash is taken as CoreWord. If a slot is NA, grepl() returns
# FALSE, so the NA itself is propagated — matching the echoed output.
# Fixes vs. original: TRUE instead of the reassignable alias T, and a
# typed NA_character_ fallback so case_when() keeps a character result
# (an untyped logical NA errors on older dplyr versions).
pmb <- pmb %>% mutate(CoreWord =
    case_when(
    !grepl("-", Word1) ~ Word1,
    !grepl("-", Word2) ~ Word2,
    !grepl("-", Word3) ~ Word3,
    !grepl("-", Word4) ~ Word4,
    !grepl("-", Word5) ~ Word5,
    TRUE ~ NA_character_))

# Check the derived columns (Word1-Word5, CharactersOrder, CoreWord).
print(pmb, width=Inf)
## # A tibble: 4,678 × 12
## # Groups:   Word [3,127]
##    Sense                                                                  
##    <chr>                                                                  
##  1 Damian is not normally a man of action so now, stirred into action, he…
##  2 male.n.02                                                              
##  3 -                                                                      
##  4 be.v.01                                                                
##  5 time.n.08                                                              
##  6 normally.r.01                                                          
##  7 man.n.01                                                               
##  8 action.n.06                                                            
##  9 -                                                                      
## 10 entity.n.01                                                            
##    Operator                                 Word                          
##    <chr>                                    <chr>                         
##  1 "-"                                      FULL_SENTENCE                 
##  2 "Name \"Damian\""                        % Damian       [0-6]          
##  3 "NEGATION <1"                            %                             
##  4 "Theme -1 Time +1 Manner +2 Co-Theme +3" % is --not--       [7-13]     
##  5 "EQU now"                                %                             
##  6 "-"                                      % normally     [14-22]        
##  7 "Attribute +1"                           % --a-- man --of--     [23-31]
##  8 "-"                                      % action --so--    [32-41]    
##  9 "EXPLANATION <1"                         %                             
## 10 "-"                                      %                             
##    DocID         sent_nr Word1    Word2   Word3  Word4 Word5
##    <chr>           <dbl> <chr>    <chr>   <chr>  <chr> <chr>
##  1 d0352_p31.txt     286 <NA>     <NA>    <NA>   <NA>  <NA> 
##  2 d0352_p31.txt     286 Damian   <NA>    <NA>   <NA>  <NA> 
##  3 d0352_p31.txt     286 <NA>     <NA>    <NA>   <NA>  <NA> 
##  4 d0352_p31.txt     286 is       --not-- <NA>   <NA>  <NA> 
##  5 d0352_p31.txt     286 <NA>     <NA>    <NA>   <NA>  <NA> 
##  6 d0352_p31.txt     286 normally <NA>    <NA>   <NA>  <NA> 
##  7 d0352_p31.txt     286 --a--    man     --of-- <NA>  <NA> 
##  8 d0352_p31.txt     286 action   --so--  <NA>   <NA>  <NA> 
##  9 d0352_p31.txt     286 <NA>     <NA>    <NA>   <NA>  <NA> 
## 10 d0352_p31.txt     286 <NA>     <NA>    <NA>   <NA>  <NA> 
##    CharactersOrder CoreWord
##    <chr>           <chr>   
##  1 <NA>            <NA>    
##  2 [0-6]           Damian  
##  3 <NA>            <NA>    
##  4 [7-13]          is      
##  5 <NA>            <NA>    
##  6 [14-22]         normally
##  7 [23-31]         man     
##  8 [32-41]         action  
##  9 <NA>            <NA>    
## 10 <NA>            <NA>    
## # ℹ 4,668 more rows
# uniquepos ranks repeated tokens: when the same stripped word occurs more
# than once in a subject's sentence, each occurrence gets its own index so
# the right token is matched during merging.
reading <- reading %>%
    group_by(subj_nr, sent_nr, wordstripped) %>%
    mutate(uniquepos = row_number()) %>%
    ungroup()

# Sanity check: in sentence 100 the word "the" appears twice, and uniquepos
# labels the two tokens 1 and 2. Note the rows are kept even where the
# subject apparently skipped the word in ET (the RT measures are 0).
print(filter(reading, sent_nr==100 & exp=="et"), n=10, width=Inf)
## # A tibble: 378 × 42
##    subj_nr sent_nr sent_pos correct answer_time word_pos word      RT
##    <chr>     <dbl>    <dbl> <chr>   <chr>          <dbl> <chr>  <dbl>
##  1 ET_1        100       62 c       3058               1 Andrew    NA
##  2 ET_1        100       62 c       3058               2 closed    NA
##  3 ET_1        100       62 c       3058               3 the       NA
##  4 ET_1        100       62 c       3058               4 office    NA
##  5 ET_1        100       62 c       3058               5 door      NA
##  6 ET_1        100       62 c       3058               6 on        NA
##  7 ET_1        100       62 c       3058               7 the       NA
##  8 ET_1        100       62 c       3058               8 way       NA
##  9 ET_1        100       62 c       3058               9 out.      NA
## 10 ET_2        100       64 c       1999               1 Andrew    NA
##    exp   RTfirstfix RTfirstpass RTrightbound RTgopast wordstripped
##    <chr>      <dbl>       <dbl>        <dbl>    <dbl> <chr>       
##  1 et           272         272          272      272 Andrew      
##  2 et           136         272          272      272 closed      
##  3 et             0           0            0        0 the         
##  4 et           148         148          148      148 office      
##  5 et           228         228          228      228 door        
##  6 et           172         172          172      172 on          
##  7 et             0           0            0        0 the         
##  8 et           320         320          320      320 way         
##  9 et           172         172          172      172 out         
## 10 et           220         220          220      220 Andrew      
##    FreqCount Cbeebies_freq CBBC_freq BNC_freq `LogFreq(Zipf)`
##        <dbl>         <dbl>     <dbl>    <dbl>           <dbl>
##  1     14860            53       410     4490            4.87
##  2      9941           185       439     9877            4.69
##  3   9418422        186232    512056  6187927            7.67
##  4     27400           183       737    25684            5.13
##  5     36644          1498      3027    25365            5.26
##  6   1569081         43153    108881   724197            6.89
##  7   9418422        186232    512056  6187927            7.67
##  8    264662          9916     18404    96280            6.12
##  9    611911         17242     50842   155352            6.48
## 10     14860            53       410     4490            4.87
##    `LogFreqCbeebies(Zipf)` `LogFreqCBBC(Zipf)` `LogFreqBNC(Zipf)` CD_count
##                      <dbl>               <dbl>              <dbl>    <dbl>
##  1                    3.96                4.48               4.65     4618
##  2                    4.5                 4.51               4.99     7075
##  3                    7.5                 7.57               7.79    44361
##  4                    4.5                 4.73               5.41    12090
##  5                    5.41                5.35               5.4     15360
##  6                    6.87                6.9                6.86    43989
##  7                    7.5                 7.57               7.79    44361
##  8                    6.23                6.13               5.98    40373
##  9                    6.47                6.57               6.19    42763
## 10                    3.96                4.48               4.65     4618
##    CD_count_Cbeebies CD_count_CBBC    CD CD_cbeebies CD_cbbc DomPoS     
##                <dbl>         <dbl> <dbl>       <dbl>   <dbl> <chr>      
##  1                19            95 0.102       0.004   0.02  name       
##  2               136           344 0.157       0.028   0.071 verb       
##  3              4692          4756 0.984       0.968   0.983 determiner 
##  4                74           476 0.268       0.015   0.098 noun       
##  5               783          1315 0.341       0.162   0.272 noun       
##  6              4498          4684 0.975       0.928   0.968 preposition
##  7              4692          4756 0.984       0.968   0.983 determiner 
##  8              3062          4239 0.895       0.632   0.876 noun       
##  9              3829          4590 0.948       0.79    0.948 preposition
## 10                19            95 0.102       0.004   0.02  name       
##    DomPoSLemma DomPoSFreq DomPoSLemmaTotalFreq
##    <chr>            <dbl>                <dbl>
##  1 andrew           14857                14857
##  2 close             8586                20167
##  3 the            9412235              9412235
##  4 office           19160                21625
##  5 door             35708                46544
##  6 on             1566464              1566464
##  7 the            9412235              9412235
##  8 way             262353               281221
##  9 out             580544               580544
## 10 andrew           14857                14857
##    AllPoS                                             
##    <chr>                                              
##  1 .name.verb.noun.                                   
##  2 .verb.adjective.name.                              
##  3 .determiner.name.verb.noun.adverb.number.adjective.
##  4 .noun.name.                                        
##  5 .noun.name.                                        
##  6 .preposition.adverb.name.adjective.verb.           
##  7 .determiner.name.verb.noun.adverb.number.adjective.
##  8 .noun.adverb.name.                                 
##  9 .preposition.adverb.adjective.noun.name.verb.      
## 10 .name.verb.noun.                                   
##    AllPoSFreq                   CapitFreq Spell_check Hyphen
##    <chr>                            <dbl> <chr>        <dbl>
##  1 .14857.2.1.                      14857 X                0
##  2 .8586.1346.9.                      152 UKUS             0
##  3 .9412235.6164.17.2.2.1.1.       946828 UKUS             0
##  4 .19160.8240.                      8253 UKUS             0
##  5 .35708.936.                       1228 UKUS             0
##  6 .1566464.2291.324.1.1.           54685 UKUS             0
##  7 .9412235.6164.17.2.2.1.1.       946828 UKUS             0
##  8 .262353.1194.1115.                3379 UKUS             0
##  9 .580544.30770.216.189.184.8.      9472 UKUS             0
## 10 .14857.2.1.                      14857 X                0
##    Homophonic_entry Double_word_entry Freq_double_entry logfreqscaled
##    <chr>            <chr>             <chr>                     <dbl>
##  1 <NA>             <NA>              <NA>                     -0.871
##  2 <NA>             <NA>              <NA>                     -1.01 
##  3 <NA>             <NA>              <NA>                      1.44 
##  4 <NA>             <NA>              <NA>                     -0.651
##  5 <NA>             <NA>              <NA>                     -0.547
##  6 <NA>             <NA>              <NA>                      0.798
##  7 <NA>             <NA>              <NA>                      1.44 
##  8 way,             weigh,            whey                      0.161
##  9 <NA>             <NA>              <NA>                      0.461
## 10 <NA>             <NA>              <NA>                     -0.871
##    uniquepos
##        <int>
##  1         1
##  2         1
##  3         1
##  4         1
##  5         1
##  6         1
##  7         2
##  8         1
##  9         1
## 10         1
## # ℹ 368 more rows
# We currently merge by CoreWord; another option is to merge by Word1, Word2,
# or Word3. As for the reading data, number repeated CoreWord tokens within a
# sentence so duplicates join unambiguously (row_number() replaces the
# 1:length(x) anti-pattern; behavior is identical for non-empty groups).
pmb <- pmb %>%
  group_by(sent_nr, CoreWord) %>%
  mutate(uniquepos = row_number()) %>%
  ungroup()
print(filter(pmb, sent_nr==100))
## # A tibble: 8 × 13
##   Sense         Operator Word  DocID sent_nr Word1 Word2 Word3 Word4 Word5
##   <chr>         <chr>    <chr> <chr>   <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 Andrew close… "-"      FULL… d035…     100 <NA>  <NA>  <NA>  <NA>  <NA> 
## 2 male.n.02     "Name \… % An… d035…     100 Andr… <NA>  <NA>  <NA>  <NA> 
## 3 close.v.01    "Agent … % cl… d035…     100 clos… <NA>  <NA>  <NA>  <NA> 
## 4 time.n.08     "TPR no… %     d035…     100 <NA>  <NA>  <NA>  <NA>  <NA> 
## 5 office.n.01   "Attrib… % --… d035…     100 --th… offi… <NA>  <NA>  <NA> 
## 6 door.n.01     "-"      % do… d035…     100 door  --on… <NA>  <NA>  <NA> 
## 7 way.n.07      "-"      % --… d035…     100 --th… way   <NA>  <NA>  <NA> 
## 8 out.a.06      "Attrib… % ou… d035…     100 out   --.-- <NA>  <NA>  <NA> 
## # ℹ 3 more variables: CharactersOrder <chr>, CoreWord <chr>,
## #   uniquepos <int>
# Use the CoreWord form as the key that matches the reading data's
# wordstripped column.
pmb <- pmb %>% mutate(wordstripped = CoreWord)

# Keep only tokens present in both the PMB annotation and the reading data.
merged <- pmb %>%
  inner_join(reading, by = c("sent_nr", "wordstripped", "uniquepos"))

# If you use Word1, Word2 and Word3 together, then uncomment this part.
# Each WordN column gets its own occurrence counter (uniqueposN); the reading
# data are then joined once per WordN column and the partial merges stacked.

#pmb <- pmb %>% group_by(sent_nr, Word1) %>% mutate(uniquepos1=1:length(Word1)) %>% ungroup()
#pmb <- pmb %>% group_by(sent_nr, Word2) %>% mutate(uniquepos2=1:length(Word2)) %>% ungroup()
# NOTE(review): fixed copy-paste error below — the Word3 counter must count
# Word3 occurrences (the original read 1:length(Word2)).
#pmb <- pmb %>% group_by(sent_nr, Word3) %>% mutate(uniquepos3=1:length(Word3)) %>% ungroup()

#pmb$wordstripped <- pmb$Word1

# uniquepos1 is used also for the match
#pmb$uniquepos <- pmb$uniquepos1

#part1 <- inner_join(pmb, reading, by=c("sent_nr", "wordstripped", "uniquepos"))

# basedonword records which WordN column produced this match
#part1$basedonword <- 1

#print(part1, width=Inf)

#pmb$wordstripped <- pmb$Word2

# uniquepos2 is used also for the match
#pmb$uniquepos <- pmb$uniquepos2

#part2 <- inner_join(pmb, reading, by=c("sent_nr", "wordstripped", "uniquepos"))

#part2$basedonword <- 2

#pmb$wordstripped <- pmb$Word3

# uniquepos3 is used also for the match
#pmb$uniquepos <- pmb$uniquepos3

#part3 <- inner_join(pmb, reading, by=c("sent_nr", "wordstripped", "uniquepos"))

#part3$basedonword <- 3

# stack the three partial merges into a single data frame
#merged <- rbind(part1, part2, part3)

print(merged, width=Inf)
## # A tibble: 276,692 × 53
##    Sense     Operator          Word                 DocID         sent_nr
##    <chr>     <chr>             <chr>                <chr>           <dbl>
##  1 male.n.02 "Name \"Damian\"" % Damian       [0-6] d0352_p31.txt     286
##  2 male.n.02 "Name \"Damian\"" % Damian       [0-6] d0352_p31.txt     286
##  3 male.n.02 "Name \"Damian\"" % Damian       [0-6] d0352_p31.txt     286
##  4 male.n.02 "Name \"Damian\"" % Damian       [0-6] d0352_p31.txt     286
##  5 male.n.02 "Name \"Damian\"" % Damian       [0-6] d0352_p31.txt     286
##  6 male.n.02 "Name \"Damian\"" % Damian       [0-6] d0352_p31.txt     286
##  7 male.n.02 "Name \"Damian\"" % Damian       [0-6] d0352_p31.txt     286
##  8 male.n.02 "Name \"Damian\"" % Damian       [0-6] d0352_p31.txt     286
##  9 male.n.02 "Name \"Damian\"" % Damian       [0-6] d0352_p31.txt     286
## 10 male.n.02 "Name \"Damian\"" % Damian       [0-6] d0352_p31.txt     286
##    Word1  Word2 Word3 Word4 Word5 CharactersOrder CoreWord uniquepos
##    <chr>  <chr> <chr> <chr> <chr> <chr>           <chr>        <int>
##  1 Damian <NA>  <NA>  <NA>  <NA>  [0-6]           Damian           1
##  2 Damian <NA>  <NA>  <NA>  <NA>  [0-6]           Damian           1
##  3 Damian <NA>  <NA>  <NA>  <NA>  [0-6]           Damian           1
##  4 Damian <NA>  <NA>  <NA>  <NA>  [0-6]           Damian           1
##  5 Damian <NA>  <NA>  <NA>  <NA>  [0-6]           Damian           1
##  6 Damian <NA>  <NA>  <NA>  <NA>  [0-6]           Damian           1
##  7 Damian <NA>  <NA>  <NA>  <NA>  [0-6]           Damian           1
##  8 Damian <NA>  <NA>  <NA>  <NA>  [0-6]           Damian           1
##  9 Damian <NA>  <NA>  <NA>  <NA>  [0-6]           Damian           1
## 10 Damian <NA>  <NA>  <NA>  <NA>  [0-6]           Damian           1
##    wordstripped subj_nr sent_pos correct answer_time word_pos word      RT
##    <chr>        <chr>      <dbl> <chr>   <chr>          <dbl> <chr>  <dbl>
##  1 Damian       1            254 -       NaN                1 Damian   179
##  2 Damian       4             82 -       NaN                1 Damian   185
##  3 Damian       9             39 -       NaN                1 Damian   249
##  4 Damian       10           186 -       NaN                1 Damian   169
##  5 Damian       11            59 -       NaN                1 Damian   202
##  6 Damian       13            89 -       NaN                1 Damian   208
##  7 Damian       14             7 -       NaN                1 Damian   826
##  8 Damian       15           170 -       NaN                1 Damian   176
##  9 Damian       16            55 -       NaN                1 Damian   234
## 10 Damian       17           173 -       NaN                1 Damian   233
##    exp   RTfirstfix RTfirstpass RTrightbound RTgopast FreqCount
##    <chr>      <dbl>       <dbl>        <dbl>    <dbl>     <dbl>
##  1 spr           NA          NA           NA       NA       833
##  2 spr           NA          NA           NA       NA       833
##  3 spr           NA          NA           NA       NA       833
##  4 spr           NA          NA           NA       NA       833
##  5 spr           NA          NA           NA       NA       833
##  6 spr           NA          NA           NA       NA       833
##  7 spr           NA          NA           NA       NA       833
##  8 spr           NA          NA           NA       NA       833
##  9 spr           NA          NA           NA       NA       833
## 10 spr           NA          NA           NA       NA       833
##    Cbeebies_freq CBBC_freq BNC_freq `LogFreq(Zipf)`
##            <dbl>     <dbl>    <dbl>           <dbl>
##  1            16        12      486            3.62
##  2            16        12      486            3.62
##  3            16        12      486            3.62
##  4            16        12      486            3.62
##  5            16        12      486            3.62
##  6            16        12      486            3.62
##  7            16        12      486            3.62
##  8            16        12      486            3.62
##  9            16        12      486            3.62
## 10            16        12      486            3.62
##    `LogFreqCbeebies(Zipf)` `LogFreqCBBC(Zipf)` `LogFreqBNC(Zipf)` CD_count
##                      <dbl>               <dbl>              <dbl>    <dbl>
##  1                    3.46                2.98               3.69      318
##  2                    3.46                2.98               3.69      318
##  3                    3.46                2.98               3.69      318
##  4                    3.46                2.98               3.69      318
##  5                    3.46                2.98               3.69      318
##  6                    3.46                2.98               3.69      318
##  7                    3.46                2.98               3.69      318
##  8                    3.46                2.98               3.69      318
##  9                    3.46                2.98               3.69      318
## 10                    3.46                2.98               3.69      318
##    CD_count_Cbeebies CD_count_CBBC    CD CD_cbeebies CD_cbbc DomPoS
##                <dbl>         <dbl> <dbl>       <dbl>   <dbl> <chr> 
##  1                 2             7 0.007           0   0.001 name  
##  2                 2             7 0.007           0   0.001 name  
##  3                 2             7 0.007           0   0.001 name  
##  4                 2             7 0.007           0   0.001 name  
##  5                 2             7 0.007           0   0.001 name  
##  6                 2             7 0.007           0   0.001 name  
##  7                 2             7 0.007           0   0.001 name  
##  8                 2             7 0.007           0   0.001 name  
##  9                 2             7 0.007           0   0.001 name  
## 10                 2             7 0.007           0   0.001 name  
##    DomPoSLemma DomPoSFreq DomPoSLemmaTotalFreq AllPoS                    
##    <chr>            <dbl>                <dbl> <chr>                     
##  1 damian             704                  704 .name.adjective.verb.noun.
##  2 damian             704                  704 .name.adjective.verb.noun.
##  3 damian             704                  704 .name.adjective.verb.noun.
##  4 damian             704                  704 .name.adjective.verb.noun.
##  5 damian             704                  704 .name.adjective.verb.noun.
##  6 damian             704                  704 .name.adjective.verb.noun.
##  7 damian             704                  704 .name.adjective.verb.noun.
##  8 damian             704                  704 .name.adjective.verb.noun.
##  9 damian             704                  704 .name.adjective.verb.noun.
## 10 damian             704                  704 .name.adjective.verb.noun.
##    AllPoSFreq    CapitFreq Spell_check Hyphen Homophonic_entry
##    <chr>             <dbl> <chr>        <dbl> <chr>           
##  1 .704.123.3.3.       828 X                0 <NA>            
##  2 .704.123.3.3.       828 X                0 <NA>            
##  3 .704.123.3.3.       828 X                0 <NA>            
##  4 .704.123.3.3.       828 X                0 <NA>            
##  5 .704.123.3.3.       828 X                0 <NA>            
##  6 .704.123.3.3.       828 X                0 <NA>            
##  7 .704.123.3.3.       828 X                0 <NA>            
##  8 .704.123.3.3.       828 X                0 <NA>            
##  9 .704.123.3.3.       828 X                0 <NA>            
## 10 .704.123.3.3.       828 X                0 <NA>            
##    Double_word_entry Freq_double_entry logfreqscaled
##    <chr>             <chr>                     <dbl>
##  1 <NA>              <NA>                      -1.90
##  2 <NA>              <NA>                      -1.90
##  3 <NA>              <NA>                      -1.90
##  4 <NA>              <NA>                      -1.90
##  5 <NA>              <NA>                      -1.90
##  6 <NA>              <NA>                      -1.90
##  7 <NA>              <NA>                      -1.90
##  8 <NA>              <NA>                      -1.90
##  9 <NA>              <NA>                      -1.90
## 10 <NA>              <NA>                      -1.90
## # ℹ 276,682 more rows
# Sanity check: for sentence 100, the reading-data words line up with the PMB
# CoreWord / wordstripped forms and sit at the expected word_pos.
merged %>%
  filter(sent_nr == 100) %>%
  group_by(word) %>%
  summarise(first(CoreWord), first(wordstripped), first(word_pos)) %>%
  print(width = Inf)
## # A tibble: 6 × 4
##   word   `first(CoreWord)` `first(wordstripped)` `first(word_pos)`
##   <chr>  <chr>             <chr>                             <dbl>
## 1 Andrew Andrew            Andrew                                1
## 2 closed closed            closed                                2
## 3 door   door              door                                  5
## 4 office office            office                                4
## 5 out.   out               out                                   9
## 6 way    way               way                                   8
# Remove the auxiliary merge columns and reorder the rows by experiment,
# subject, sentence, and word position.
merged <- merged %>%
  select(-uniquepos, -wordstripped) %>%
  arrange(exp, subj_nr, sent_nr, word_pos)

We store the merged results as a tab-separated file.

write_tsv(merged, "UCL_corpus_with_PMB_and_frequencies.tsv")

We store the same results, but without the frequency information.

# Drop every frequency-derived column (FreqCount through the last column).
# last_col() keeps the whole range inside tidyselect; the previous
# -c(FreqCount:ncol(merged)) mixed a column name with a numeric position
# computed outside the selection, which is fragile.
merged_small <- merged %>% select(-(FreqCount:last_col()))
write_tsv(merged_small, "UCL_corpus_with_PMB_without_frequencies.tsv")