Skip to contents

This example demonstrates a few ways to specify comparisons and groups in lingmatch.

Built with R 4.3.1 on October 07 2023


Setup

We’ll generate some word category output, in a sort of experimental design that allows for all available comparison types.

Imagine that, in each of two studies, we paired up participants and then had each pair hold a series of interactions after reading one of a set of prompts:

# load lingmatch
library(lingmatch)

# first, we have simple representations (function word category use frequencies)
# of our prompts (3 prompts per study):
# - one row per study-by-prompt combination (3 * 2 = 6 rows)
# - 7 category columns, named by names(lma_dict(1:7))
# - values are random normal draws (mean 10, sd 4), so they vary between runs
#   unless a seed is set, and can occasionally be negative (see row 3 below)
prompts <- data.frame(
  study = rep(paste("study", 1:2), each = 3),
  prompt = rep(paste("prompt", 1:3), 2),
  matrix(rnorm(3 * 2 * 7, 10, 4), 3 * 2, dimnames = list(NULL, names(lma_dict(1:7))))
)
# first 5 rows; the 2 ID columns plus the first 6 category columns
prompts[1:5, 1:8]
#>     study   prompt     ppron      ipron   article   adverb      conj      prep
#> 1 study 1 prompt 1 19.174226  9.9282870  3.053319 4.688521  7.541644 12.992584
#> 2 study 1 prompt 2 12.282710  4.3561748 14.342092 8.897854 10.237322  8.670206
#> 3 study 1 prompt 3 12.843205 -0.7312331  8.432034 5.745791 13.552502  9.625417
#> 4 study 2 prompt 1 13.786820  4.0464953  9.931430 9.383238 10.094528 15.010402
#> 5 study 2 prompt 2  4.290559 21.1146924  2.499395 7.985822 15.081431 15.603813

# then, the same representation of the language the participants produced:
# - 100 rows; study and pair labels are each sampled with replacement and then
#   sorted (independently of each other), so rows sharing a label are contiguous
# - pair numbers are zero-padded (e.g., "pair 01") so they sort in numeric order
# - prompt and speaker are sampled independently for every row
# - category columns reuse the names from prompts (minus its 2 ID columns)
data <- data.frame(
  study = sort(sample(paste("study", 1:2), 100, TRUE)),
  pair = sort(sample(paste("pair", formatC(1:20, width = 2, flag = 0)), 100, TRUE)),
  prompt = sample(paste("prompt", 1:3), 100, TRUE),
  speaker = sample(c("a", "b"), 100, TRUE),
  matrix(rnorm(100 * 7, 10, 4), 100, dimnames = list(NULL, colnames(prompts)[-(1:2)]))
)
# first 5 rows; the 4 ID columns plus the first 4 category columns
data[1:5, 1:8]
#>     study    pair   prompt speaker     ppron     ipron   article   adverb
#> 1 study 1 pair 01 prompt 2       a  3.976058  6.588169 11.963315 12.41309
#> 2 study 1 pair 01 prompt 3       a  3.756812  2.815671  9.942102  4.17988
#> 3 study 1 pair 01 prompt 1       b  7.152982 11.805938  9.187029 15.55877
#> 4 study 1 pair 01 prompt 3       a  8.456743 10.694663  8.069198 15.08692
#> 5 study 1 pair 01 prompt 2       a 10.781789 13.615539  7.734572 10.52299

Matching with a standard

Sample means

Compare each row (here representing a turn in a conversation) with the sample’s mean:

# the `lsm` (Language Style Matching) type specifies the columns to consider,
# and the metric to use (Canberra similarity)
# passing the `mean` function as the comparison compares every row of data
# with a single profile of category means (shown in `comp` below)
lsm_mean <- lingmatch(data, mean, type = "lsm")

# look at comparison information
lsm_mean[c("comp.type", "comp")]
#> $comp.type
#> [1] "mean"
#> 
#> $comp
#>     ppron     ipron   article    adverb      conj      prep   auxverb 
#>  9.761108 10.648898  9.777774 10.152384 10.846616  9.598028  9.594665

# and maybe the average similarity score
mean(lsm_mean$sim)
#> [1] 0.8268262

This could be considered a baseline for the sample.

Stored means

These LSM categories have some standard means stored internally, as found in the LIWC manual.

# compare with means from a set of tweets
# (a string comparison names one of the internally stored sets of means)
lsm_twitter <- lingmatch(data, "twitter", type = "lsm")
# `comp` shows the stored means that every row was compared with
lsm_twitter[c("comp.type", "comp")]
#> $comp.type
#> [1] "twitter"
#> 
#> $comp
#>         ppron ipron article adverb conj  prep auxverb
#> twitter  9.02   4.6    5.58   5.13 4.19 11.88    8.27
mean(lsm_twitter$sim)
#> [1] 0.7191222

# or the means of the set that is most similar to the current set
lsm_auto <- lingmatch(data, "auto", type = "lsm")
# `comp.type` reports which stored set was selected (here, nytimes)
lsm_auto[c("comp.type", "comp")]
#> $comp.type
#> [1] "auto: nytimes"
#> 
#> $comp
#>         ppron ipron article adverb conj  prep auxverb
#> nytimes  3.56  3.84    9.08   2.76 4.85 14.27    5.11
mean(lsm_auto$sim)
#> [1] 0.6382761

External means

If you have another set of data, you can also use its means as the comparison:

# average each category column of prompts (dropping its 2 ID columns, study
# and prompt) and use those means as the comparison for every row of data
lsm_prmed <- lingmatch(data, colMeans(prompts[, -(1:2)]), type = "lsm")
# `comp.type` records the expression that was passed as the comparison
lsm_prmed[c("comp.type", "comp")]
#> $comp.type
#> [1] "colMeans(prompts[, -(1:2)])"
#> 
#> $comp
#>     ppron     ipron   article    adverb      conj      prep   auxverb 
#> 12.374987  7.727783  8.215227  6.946889 12.024881 12.102464  7.386018
# average similarity of the rows to the prompt means
mean(lsm_prmed$sim)
#> [1] 0.7966027

Group means

You can also compare to means within groups. Here, studies might be considered groups:

# split data by its study column; each row is compared with the mean of its
# own study (one row of `comp` per study)
lsm_topics <- lingmatch(data, group = study, type = "lsm")
lsm_topics[c("comp.type", "comp")]
#> $comp.type
#> [1] "study group mean"
#> 
#> $comp
#>            ppron    ipron   article    adverb     conj      prep   auxverb
#> study 1  8.93558 10.57181  9.065483 10.800671 10.51655 10.479439 10.050747
#> study 2 10.43654 10.71197 10.360558  9.621968 11.11667  8.876874  9.221507
# average similarity within each study: column 1 of `sim` holds the group
# label and column 2 holds the similarity score
tapply(lsm_topics$sim[, 2], lsm_topics$sim[, 1], mean)
#>   study 1   study 2 
#> 0.8254038 0.8317847

A group variable of this type just splits the data and performs the same comparison within each split.

Matching with other texts

The previous comparisons were all with standards, where the LSM score could be interpreted as indicating a more or less generic language style (as defined by the comparison and grouping).

Condition ID

Here, prompts constitute our experimental conditions. We have 3 unique prompt IDs, but 6 unique prompts, since each study had its own set, so we need the study and prompt ID to appropriately match prompts:

# use prompts as the comparison, matching rows of data to rows of prompts by
# the combination of their study and prompt columns
lsm <- lingmatch(data, prompts, group = c("study", "prompt"), type = "lsm")
lsm$comp.type
#> [1] "prompts"
# first 6 category columns of the comparison; row names are the pasted
# study and prompt values
lsm$comp[, 1:6]
#>                      ppron      ipron   article   adverb      conj      prep
#> study 1 prompt 1 19.174226  9.9282870  3.053319 4.688521  7.541644 12.992584
#> study 1 prompt 2 12.282710  4.3561748 14.342092 8.897854 10.237322  8.670206
#> study 1 prompt 3 12.843205 -0.7312331  8.432034 5.745791 13.552502  9.625417
#> study 2 prompt 1 13.786820  4.0464953  9.931430 9.383238 10.094528 15.010402
#> study 2 prompt 2  4.290559 21.1146924  2.499395 7.985822 15.081431 15.603813
#> study 2 prompt 3 11.872403  7.6522825 11.033090 4.980109 15.641861 10.712361
# first 10 rows: g1 is the condition ID each row was matched to, and
# canberra is that row's similarity to the matched prompt
lsm$sim[1:10, ]
#>                  g1  canberra
#> 1  study 1 prompt 2 0.6419421
#> 2  study 1 prompt 3 0.7093420
#> 3  study 1 prompt 1 0.7029009
#> 4  study 1 prompt 3 0.5716712
#> 5  study 1 prompt 2 0.8243401
#> 6  study 1 prompt 1 0.5881550
#> 7  study 1 prompt 3 0.6266525
#> 8  study 1 prompt 1 0.7031869
#> 9  study 1 prompt 1 0.7381836
#> 10 study 1 prompt 1 0.6981039

Here, the group argument is just pasting together the included variables, and using the resulting string to identify a single comparison for each text (acting as a condition ID).

Participant ID

Similarly, participants are only uniquely identified by pair ID and speaker ID (though this could just as well be a single column with unique IDs).

# no comparison is given, only a group: participants are identified by the
# combination of the pair and speaker columns
interlsm <- lingmatch(data, group = c("pair", "speaker"), type = "lsm")
# one comparison row per participant (presumably that participant's mean
# across their rows, as with the study group means -- confirm in lingmatch docs)
interlsm$comp[1:10, ]
#>               ppron     ipron   article    adverb      conj      prep   auxverb
#> pair 01 a  6.742851  8.428510  9.427297 10.550722  7.424471  8.638640  7.134622
#> pair 01 b  5.533245 14.644290  8.996506 13.065131 12.923018 10.501889  8.476857
#> pair 02 b 12.034875  9.361364  7.269673 12.549221 10.227551 13.095727 11.104860
#> pair 02 a  8.555251  8.492119  5.522736 11.366566  9.312478 13.910881 14.826184
#> pair 03 a 10.031531  9.361374 12.110378  9.090354 10.729273 11.986164 10.389608
#> pair 03 b 14.077289  7.681167 12.910905  8.874849  8.830308  9.296559 12.693680
#> pair 04 a  8.689068 10.371357 10.918580 11.955403 10.511910 10.724440 12.159187
#> pair 04 b  8.396325 12.652042  9.884445 15.114406 12.223511  7.618507  7.252497
#> pair 05 a  9.065626 13.873713  7.056387 11.720066 11.352245 11.251204  9.223638
#> pair 05 b  8.898625  9.254042 12.800030 11.919600 13.774630  9.500910 10.694043
# first 10 rows: each row's similarity to the comparison for its participant (g1)
interlsm$sim[1:10, ]
#>           g1  canberra
#> 1  pair 01 a 0.7359023
#> 2  pair 01 a 0.7197391
#> 3  pair 01 b 0.9035415
#> 4  pair 01 a 0.8347322
#> 5  pair 01 a 0.8306081
#> 6  pair 01 b 0.8815873
#> 7  pair 02 b 0.8932422
#> 8  pair 02 a 0.8821426
#> 9  pair 02 b 0.9302894
#> 10 pair 02 a 0.8812338

Matching in sequence

Since participants are interacting in sequence, we might compare each turn with the turn that follows it. The last entry in the group argument specifies the speaker:

# "seq" requests sequential comparisons; pair defines the conversation and
# speaker (the last group entry) identifies who produced each row
seqlsm <- lingmatch(data, "seq", group = c("pair", "speaker"), type = "lsm")
# row names list the data rows on each side of a comparison; consecutive rows
# from the same speaker are combined (e.g., "1, 2 <-> 3")
seqlsm$sim[1:10, ]
#>                     group  canberra
#> 1, 2 <-> 3        pair 01 0.6657941
#> 3 <-> 4, 5        pair 01 0.8613678
#> 4, 5 <-> 6        pair 01 0.7761998
#> 7 <-> 8           pair 02 0.8979219
#> 8 <-> 9           pair 02 0.8777935
#> 9 <-> 10          pair 02 0.7712818
#> 10 <-> 11         pair 02 0.8209108
#> 12, 13, 14 <-> 15 pair 03 0.9094760
#> 16, 17 <-> 18, 19 pair 04 0.7996544
#> 18, 19 <-> 20, 21 pair 04 0.7735631

The rownames of sim show the row numbers that are being compared, with some being aggregated if the same speaker takes multiple turns in a row. You could also compare only the edges (the two rows on either side of each change of speaker, without aggregating) by adding agg = FALSE:

# same sequential comparison, but without aggregating consecutive rows from
# the same speaker: only the two rows adjacent to each speaker change are
# compared (e.g., "2 <-> 3" rather than "1, 2 <-> 3")
lingmatch(
  data, "seq",
  group = c("pair", "speaker"), type = "lsm", agg = FALSE
)$sim[1:10, ]
#>             group  canberra
#> 2 <-> 3   pair 01 0.6547178
#> 3 <-> 4   pair 01 0.8739750
#> 5 <-> 6   pair 01 0.8130460
#> 7 <-> 8   pair 02 0.8979219
#> 8 <-> 9   pair 02 0.8777935
#> 9 <-> 10  pair 02 0.7712818
#> 10 <-> 11 pair 02 0.8209108
#> 14 <-> 15 pair 03 0.7657990
#> 17 <-> 18 pair 04 0.6965492
#> 19 <-> 20 pair 04 0.7082835

Brought to you by the Language Use and Social Interaction lab at Texas Tech University