Skip to contents

This example demonstrates a few ways to specify comparisons and groups in lingmatch.

Built with R 4.2.2 on January 22 2023


Setup

We’ll generate some word category output, in a sort of experimental design that allows for all available comparison types.

Imagine that in two studies we paired up participants, then had each pair engage in a series of interactions after reading one of a set of prompts:

# load lingmatch
library("lingmatch")

# first, we have simple representations (function word category use frequencies)
# of our prompts (3 prompts per study):
# NOTE: the values are random draws and no seed is set in this example, so the
# numbers printed below will differ from run to run
prompts <- data.frame(
  # study label, repeated so each study covers 3 consecutive rows
  study = rep(paste("study", 1:2), each = 3),
  # prompt label, cycling through prompts 1-3 within each study
  prompt = rep(paste("prompt", 1:3), 2),
  # 6 rows (3 prompts x 2 studies) by 7 columns of normal draws (mean 10, SD 4);
  # the columns take their names from lma_dict(1:7)
  matrix(rnorm(3 * 2 * 7, 10, 4), 3 * 2, dimnames = list(NULL, names(lma_dict(1:7))))
)
# show the first 5 rows and first 8 columns (2 ID columns + 6 categories)
prompts[1:5, 1:8]
#>     study   prompt     ppron     ipron   article    adverb      conj      prep
#> 1 study 1 prompt 1  6.864407 13.859885  6.938533  9.984956 12.977499 11.781433
#> 2 study 1 prompt 2 12.113190  8.793740  9.987835  3.679203  6.488306  9.610828
#> 3 study 1 prompt 3  4.906687 12.534891  7.852603  8.653199 13.482843  7.836163
#> 4 study 2 prompt 1  7.747281  3.858712 10.086379 10.800065  9.719417 11.361967
#> 5 study 2 prompt 2  5.976320 10.213058  4.272967 13.107793  5.597550  9.089050

# then, the same representation of the language the participants produced:
# (100 rows; as with prompts, the values are unseeded random draws)
data <- data.frame(
  # study and pair labels are each sampled with replacement and then sorted, so
  # rows sharing a label are contiguous; note the two columns are sorted
  # independently of one another
  study = sort(sample(paste("study", 1:2), 100, TRUE)),
  # pair numbers are zero-padded to width 2 (e.g., "pair 01")
  pair = sort(sample(paste("pair", formatC(1:20, width = 2, flag = 0)), 100, TRUE)),
  # prompt and speaker are assigned to each row at random
  prompt = sample(paste("prompt", 1:3), 100, TRUE),
  speaker = sample(c("a", "b"), 100, TRUE),
  # 100 x 7 matrix of normal draws (mean 10, SD 4), reusing the category column
  # names from prompts (its first two columns are the study and prompt IDs)
  matrix(rnorm(100 * 7, 10, 4), 100, dimnames = list(NULL, colnames(prompts)[-(1:2)]))
)
# show the first 5 rows and first 8 columns (4 ID columns + 4 categories)
data[1:5, 1:8]
#>     study    pair   prompt speaker      ppron     ipron   article    adverb
#> 1 study 1 pair 01 prompt 1       b  0.7020615 10.491647  3.779056 -2.757235
#> 2 study 1 pair 01 prompt 2       a 11.3947600  8.548593 10.834191  7.965312
#> 3 study 1 pair 01 prompt 1       a 10.9304932  7.945934  9.669309  5.248526
#> 4 study 1 pair 01 prompt 3       a 10.7997755 17.222643 10.715310  8.092856
#> 5 study 1 pair 01 prompt 1       a 12.4622082  7.465270 14.939144  8.013883

Matching with a standard

Sample means

Compare each row (here representing a turn in a conversation) with the sample’s mean:

# the `lsm` (Language Style Matching) type specifies the columns to consider,
# and the metric to use (Canberra similarity)
# passing the `mean` function as the comparison compares each row of data with
# the column means of the whole sample
lsm_mean <- lingmatch(data, mean, type = "lsm")

# look at comparison information
# (comp.type labels the comparison; comp holds the values compared against)
lsm_mean[c("comp.type", "comp")]
#> $comp.type
#> [1] "mean"
#> 
#> $comp
#>     ppron     ipron   article    adverb      conj      prep   auxverb 
#> 10.079219 10.416393  9.645455  8.946775 10.435997 10.173685  9.635552

# and maybe the average similarity score
mean(lsm_mean$sim)
#> [1] 0.8312954

This could be considered a baseline for the sample.

Stored means

These LSM categories have some standard means stored internally, as found in the LIWC manual.

# compare with means from a set of tweets
# (a character comparison names one of the internally stored sets of means)
lsm_twitter <- lingmatch(data, "twitter", type = "lsm")
lsm_twitter[c("comp.type", "comp")]
#> $comp.type
#> [1] "twitter"
#> 
#> $comp
#>         ppron ipron article adverb conj  prep auxverb
#> twitter  9.02   4.6    5.58   5.13 4.19 11.88    8.27
# average similarity to the stored twitter means
mean(lsm_twitter$sim)
#> [1] 0.7237678

# or the means of the set that is most similar to the current set
# (comp.type reports which stored set was selected)
lsm_auto <- lingmatch(data, "auto", type = "lsm")
lsm_auto[c("comp.type", "comp")]
#> $comp.type
#> [1] "auto: novels"
#> 
#> $comp
#>        ppron ipron article adverb conj  prep auxverb
#> novels 10.35  4.79    8.35   4.17 6.28 14.27    7.77
# average similarity to the automatically selected set
mean(lsm_auto$sim)
#> [1] 0.7470567

External means

If you have another set of data, you can also use its means as the comparison:

# use the column means of the prompt categories (dropping the two ID columns)
# as the comparison; comp.type shows the comparison expression as it was written
lsm_prmed <- lingmatch(data, colMeans(prompts[, -(1:2)]), type = "lsm")
lsm_prmed[c("comp.type", "comp")]
#> $comp.type
#> [1] "colMeans(prompts[, -(1:2)])"
#> 
#> $comp
#>    ppron    ipron  article   adverb     conj     prep  auxverb 
#> 7.642559 8.651467 7.688486 9.920160 8.655075 9.818750 9.434635
# average similarity to the prompt means
mean(lsm_prmed$sim)
#> [1] 0.8166922

Group means

You can also compare to means within groups. Here, studies might be considered groups:

# group is given as a bare column name of data; with no comparison specified,
# each row is compared with the mean of its own study (see comp.type below)
lsm_topics <- lingmatch(data, group = study, type = "lsm")
lsm_topics[c("comp.type", "comp")]
#> $comp.type
#> [1] "study group mean"
#> 
#> $comp
#>             ppron    ipron  article   adverb      conj      prep  auxverb
#> study 1  9.970332 10.67027 9.406745 8.793292  9.726499  9.769428 9.298925
#> study 2 10.183835 10.17247 9.874803 9.094240 11.117671 10.562090 9.958977
# average similarity within each study (column 1 of sim is the group label,
# column 2 is the similarity score)
tapply(lsm_topics$sim[, 2], lsm_topics$sim[, 1], mean)
#>   study 1   study 2 
#> 0.8177467 0.8449237

This type of group variable is just splitting the data, and performing the same comparisons within splits.

Matching with other texts

The previous comparisons were all with standards, where the LSM score could be interpreted as indicating a more or less generic language style (as defined by the comparison and grouping).

Condition ID

Here, prompts constitute our experimental conditions. We have 3 unique prompt IDs, but 6 unique prompts, since each study had its own set, so we need the study and prompt ID to appropriately match prompts:

# compare each row of data with its matching row of prompts; study and prompt
# together form the ID that links a text to its prompt
lsm <- lingmatch(data, prompts, group = c("study", "prompt"), type = "lsm")
lsm$comp.type
#> [1] "prompts"
# the comparison values, one row per study-prompt combination (first 6 columns)
lsm$comp[, 1:6]
#>                      ppron     ipron   article    adverb      conj      prep
#> study 1 prompt 1  6.864407 13.859885  6.938533  9.984956 12.977499 11.781433
#> study 1 prompt 2 12.113190  8.793740  9.987835  3.679203  6.488306  9.610828
#> study 1 prompt 3  4.906687 12.534891  7.852603  8.653199 13.482843  7.836163
#> study 2 prompt 1  7.747281  3.858712 10.086379 10.800065  9.719417 11.361967
#> study 2 prompt 2  5.976320 10.213058  4.272967 13.107793  5.597550  9.089050
#> study 2 prompt 3  8.247466  2.648515  6.992599 13.295743  3.664837  9.233055
# similarity of the first 10 texts to their respective prompts
lsm$sim[1:10, ]
#>                  g1  canberra
#> 1  study 1 prompt 1 0.4196806
#> 2  study 1 prompt 2 0.7421996
#> 3  study 1 prompt 1 0.7849499
#> 4  study 1 prompt 3 0.7358929
#> 5  study 1 prompt 1 0.7092547
#> 6  study 1 prompt 3 0.7825726
#> 7  study 1 prompt 2 0.7275835
#> 8  study 1 prompt 2 0.7250848
#> 9  study 1 prompt 3 0.6666420
#> 10 study 1 prompt 1 0.7949822

Here, the group argument is just pasting together the included variables, and using the resulting string to identify a single comparison for each text (acting as a condition ID).

Participant ID

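Similarly, participants are only uniquely identified by the combination of pair ID and speaker ID (though this could just as well be a single column with unique IDs). With no comparison specified, each text is compared with the mean of its own participant’s texts: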

# pair and speaker together identify a participant; no comparison is given, so
# comp holds one row of category means per participant
interlsm <- lingmatch(data, group = c("pair", "speaker"), type = "lsm")
# show the comparison rows for the first 10 participants
interlsm$comp[1:10, ]
#>                ppron     ipron   article    adverb      conj      prep
#> pair 01 b  0.7020615 10.491647  3.779056 -2.757235  4.742780  5.672667
#> pair 01 a 11.3968092 10.295610 11.539489  7.330144  6.592874 10.141503
#> pair 02 b  9.6657169  9.840502 10.462939  7.642948 11.012827  9.891918
#> pair 03 a 11.9535745 12.435491  6.445858 12.740861 12.616633  6.268792
#> pair 03 b 10.2310641 10.529372 10.019987  8.087845  8.769773 10.779999
#> pair 04 b 11.3083514  5.863618 11.835957  7.323427 13.502936 13.973873
#> pair 05 a  9.6600465 11.391218  9.662540  8.387847  9.966338  9.799975
#> pair 05 b 10.4133382  9.882321 12.424708 13.537444 12.087927 10.672846
#> pair 06 a 17.1951134  9.296324  4.053924 13.062216 14.760465  5.377913
#> pair 06 b 10.1448893  7.680004  7.372897  8.449567  5.986017  7.971165
#>              auxverb
#> pair 01 b -0.2898078
#> pair 01 a 15.0987578
#> pair 02 b  9.7237697
#> pair 03 a 11.5777332
#> pair 03 b  9.5540010
#> pair 04 b 10.5615227
#> pair 05 a  6.7583120
#> pair 05 b 11.2632321
#> pair 06 a 12.5754275
#> pair 06 b 17.8446326
# similarity of the first 10 texts to their participant's row of comp
# (g1 is the pasted pair + speaker ID)
interlsm$sim[1:10, ]
#>           g1  canberra
#> 1  pair 01 b 1.0000000
#> 2  pair 01 a 0.8424788
#> 3  pair 01 a 0.8776799
#> 4  pair 01 a 0.8940391
#> 5  pair 01 a 0.8750118
#> 6  pair 02 b 0.7509222
#> 7  pair 02 b 0.7398585
#> 8  pair 02 b 0.8375352
#> 9  pair 02 b 0.8351512
#> 10 pair 03 a 0.8812831

Matching in sequence

Since participants are having interactions in sequence, we might compare each turn in sequence. The last entry in the group argument specifies the speaker:

# "seq" compares turns in sequence; the last entry of group ("speaker")
# identifies who is speaking, and comparisons are reported per pair
seqlsm <- lingmatch(data, "seq", group = c("pair", "speaker"), type = "lsm")
# show the first 10 sequential comparisons
seqlsm$sim[1:10, ]
#>                             group  canberra
#> 1 <-> 2, 3, 4, 5          pair 01 0.4506036
#> 6, 7, 8, 9                pair 02 1.0000000
#> 10 <-> 11                 pair 03 0.7689010
#> 11 <-> 12                 pair 03 0.7746189
#> 12 <-> 13, 14, 15, 16     pair 03 0.8756456
#> 17, 18                    pair 04 1.0000000
#> 19, 20, 21, 22 <-> 23, 24 pair 05 0.8885610
#> 23, 24 <-> 25             pair 05 0.8024998
#> 26, 27 <-> 28             pair 06 0.7645176
#> 29, 30 <-> 31             pair 07 0.8678496

The rownames of sim show the row numbers that are being compared, with some being aggregated if the same speaker takes multiple turns in a row. You could also just compare edges by adding agg = FALSE:

# same sequential comparison, but with agg = FALSE so consecutive turns by the
# same speaker are not aggregated; only the first 10 rows of sim are shown
lingmatch(
  data, "seq",
  group = c("pair", "speaker"), type = "lsm", agg = FALSE
)$sim[1:10, ]
#>              group  canberra
#> 1 <-> 2    pair 01 0.4455924
#> 6, 7, 8, 9 pair 02 1.0000000
#> 10 <-> 11  pair 03 0.7689010
#> 11 <-> 12  pair 03 0.7746189
#> 12 <-> 13  pair 03 0.8514062
#> 17, 18     pair 04 1.0000000
#> 22 <-> 23  pair 05 0.8166640
#> 24 <-> 25  pair 05 0.7569052
#> 27 <-> 28  pair 06 0.6722367
#> 30 <-> 31  pair 07 0.7898381

Brought to you by the Language Use and Social Interaction lab at Texas Tech University