#' Document-Term Matrix Categorization
#'
#' Reduces the dimensions of a document-term matrix by dictionary-based categorization.
#' @param dtm A matrix with terms as column names.
#' @param dict The name of a provided dictionary
#'   (\href{https://osf.io/y6g5b/wiki/home}{osf.io/y6g5b/wiki}) or of a file found in
#'   \code{dir}, or a \code{list} object with named character vectors as word lists,
#'   or the path to a file to be read in by \code{\link{read.dic}}.
#' @param term.weights A \code{list} object with named numeric vectors lining up with the character
#'   vectors in \code{dict}, used to weight the terms in each \code{dict} vector. If a category in
#'   \code{dict} is not specified in \code{term.weights}, or the \code{dict} and \code{term.weights}
#'   vectors aren't the same length, the weights for that category will be 1.
#' @param bias A list or named vector specifying a constant to add to the named category. If a term
#'   matching \code{bias.name} is included in a category, its associated \code{weight} will be used
#'   as the \code{bias} for that category.
#' @param bias.name A character specifying a term to be used as a category bias; default is
#'   \code{'_intercept'}.
#' @param escape Logical indicating whether the terms in \code{dict} should not be treated as plain
#'   text (including asterisk wild cards). If \code{TRUE}, regular expression related characters are
#'   escaped. Set to \code{TRUE} if you get PCRE compilation errors.
#' @param partial Logical; if \code{TRUE}, terms are partially matched (not padded by ^ and $).
#' @param glob Logical; if \code{TRUE} (default), will convert initial and terminal asterisks to
#'   partial matches.
#' @param to.lower Logical; if \code{TRUE}, will lowercase dictionary terms. Otherwise, dictionary
#'   terms will be converted to match the terms if they are single-cased. Set to \code{FALSE} to
#'   always keep dictionary terms as entered.
#' @param term.filter A regular expression string used to format the text of each term (passed to
#'   \code{gsub}). For example, if terms are part-of-speech tagged (e.g.,
#'   \code{'a_DT'}), \code{'_.*'} would remove the tag.
#' @param term.break If a category has more than \code{term.break} characters, it will be processed
#'   in chunks. Reduce from 20000 if you get a PCRE compilation error.
#' @param dir Path to a folder in which to look for \code{dict}; \cr
#'   will look in \code{'~/Dictionaries'} by default. \cr
#'   Set a session default with \code{options(lingmatch.dict.dir = 'desired/path')}.
#' @param coverage Logical; if \code{TRUE}, will calculate coverage
#'   (number of unique term matches) for each category.
#' @seealso For applying pattern-based dictionaries (to raw text) see \code{\link{lma_patcat}}.
#' @family Dictionary functions
#' @return A matrix with a row per \code{dtm} row and columns per dictionary category
#'   (with added \code{coverage_} versions if \code{coverage} is \code{TRUE}),
#'   and a \code{WC} attribute with original word counts.
#' @examples
#' \dontrun{
#'
#' # Score texts with the NRC Affect Intensity Lexicon
#'
#' dict <- readLines("https://saifmohammad.com/WebDocs/NRC-AffectIntensity-Lexicon.txt")
#' dict <- read.table(
#'   text = dict[-seq_len(grep("term\tscore", dict, fixed = TRUE)[[1]])],
#'   col.names = c("term", "weight", "category")
#' )
#'
#' text <- c(
#'   angry = paste(
#'     "We are outraged by their hateful brutality,",
#'     "and by the way they terrorize us with their hatred."
#'   ),
#'   fearful = paste(
#'     "The horrific torture of that terrorist was tantamount",
#'     "to the terrorism of terrorists."
#'   ),
#'   joyous = "I am jubilant to be celebrating the bliss of this happiest happiness.",
#'   sad = paste(
#'     "They are nearly suicidal in their mourning after",
#'     "the tragic and heartbreaking holocaust."
#'   )
#' )
#'
#' emotion_scores <- lma_termcat(text, dict)
#' if (require("splot")) splot(emotion_scores ~ names(text), leg = "out")
#'
#' ## or use the standardized version (which includes more categories)
#'
#' emotion_scores <- lma_termcat(text, "nrc_eil", dir = "~/Dictionaries")
#' emotion_scores <- emotion_scores[, c("anger", "fear", "joy", "sadness")]
#' if (require("splot")) splot(emotion_scores ~ names(text), leg = "out")
#' }
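#'
#' # a minimal added sketch of term weighting and bias: the "_intercept" term
#' # (matching bias.name) supplies a constant of 1, and "good"/"bad" are weighted 1/-1
#' lma_termcat(
#'   lma_dtm(c("good good day", "bad bad day")),
#'   dict = list(tone = c("good", "bad", "_intercept")),
#'   term.weights = list(tone = c(good = 1, bad = -1, "_intercept" = 1))
#' )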
#' @export
lma_termcat <- function(dtm, dict, term.weights = NULL, bias = NULL, bias.name = "_intercept",
                        escape = TRUE, partial = FALSE, glob = TRUE, term.filter = NULL, term.break = 2e4,
                        to.lower = FALSE, dir = getOption("lingmatch.dict.dir"), coverage = FALSE) {
  st <- proc.time()[[3]]
  if (ckd <- dir == "") dir <- "~/Dictionaries"
  if (missing(dict)) dict <- lma_dict(1:9)
  if (is.factor(dict)) dict <- as.character(dict)
  if (is.character(dict) && length(dict) == 1 && missing(term.weights) && (file.exists(dict) || !grepl("\\s", dict))) {
    if (!file.exists(dict) && any(file.exists(normalizePath(paste0(dir, "/", dict), "/", FALSE)))) {
      dict <- normalizePath(paste0(dir, "/", dict))
    }
    td <- tryCatch(read.dic(dict, dir = if (ckd) "" else dir), error = function(e) NULL)
    dict <- if (is.null(td)) list(cat1 = dict) else td
  }
  if (!is.null(dim(dict))) {
    if (!is.null(term.weights)) {
      if (is.character(term.weights) && any(su <- term.weights %in% colnames(dict))) {
        term.weights <- dict[, term.weights[su], drop = FALSE]
      }
      if (!is.null(dim(term.weights))) {
        term.weights <- term.weights[, vapply(
          seq_len(ncol(term.weights)),
          function(col) is.numeric(term.weights[, col]), TRUE
        )]
      }
    } else if (any(su <- vapply(seq_len(ncol(dict)), function(col) is.numeric(dict[, col]), TRUE))) {
      term.weights <- dict[, su, drop = FALSE]
      dict <- if (all(su)) {
        if (!is.null(rownames(dict))) {
          data.frame(term = rownames(dict), stringsAsFactors = FALSE)
        } else {
          term.weights <- if (ncol(term.weights) == 1) NULL else term.weights[, -1, drop = FALSE]
          dict[, 1, drop = FALSE]
        }
      } else {
        dict[, !su, drop = FALSE]
      }
    }
    if (!is.null(rownames(dict)) && ncol(dict) == 1 && any(grepl("^[a-z]", rownames(dict), TRUE))) {
      dict <- rownames(dict)
    } else {
      su <- vapply(seq_len(ncol(dict)), function(col) !is.numeric(dict[, col]), TRUE)
      if (!any(su)) stop("no terms found in dictionary")
      dict <- if (sum(su) > 1) {
        su <- which(su)
        if (!is.null(term.weights) && (!is.list(term.weights) || ncol(term.weights) == 1)) {
          if (is.list(term.weights)) term.weights <- term.weights[, 1]
          ssu <- vapply(su, function(col) length(unique(dict[, col])), 0) + seq(length(su), 1)
          term.weights <- split(term.weights, dict[, which.min(ssu)])
          dict <- split(dict[, which.max(ssu)], dict[, which.min(ssu)])
        } else {
          ssu <- vapply(su, function(col) anyDuplicated(dict[, col]) == 0, TRUE)
          if (any(ssu)) dict[, su[ssu][1]] else dict[, su[1]]
        }
      } else {
        dict[, su]
      }
    }
  }
  if (is.numeric(dict) && is.null(term.weights)) {
    term.weights <- dict
    dict <- names(dict)
  }
  if (is.factor(dict)) dict <- as.character(dict)
  if (!is.null(dim(term.weights))) {
    if (is.null(colnames(term.weights))) colnames(term.weights) <- if (length(dict) == length(term.weights)) names(dict) else paste0("cat", seq_len(ncol(term.weights)))
    if (!is.data.frame(term.weights)) term.weights <- as.data.frame(term.weights, stringsAsFactors = FALSE)
    su <- vapply(term.weights, is.numeric, TRUE)
    if (any(!su)) {
      if (any(ssu <- !su & vapply(term.weights, anyDuplicated, 0) == 0)) {
        rownames(term.weights) <- term.weights[, which(ssu)[1]]
      }
      term.weights <- term.weights[, su]
    }
    if (!length(term.weights)) stop("no numeric columns in term.weights")
  }
  if (!is.list(dict)) {
    dict <- if (is.matrix(dict)) {
      as.data.frame(dict, stringsAsFactors = FALSE)
    } else if (is.character(dict) && length(dict) == 1 && (file.exists(dict) || dict %in% rownames(select.dict()$info))) {
      read.dic(dict, dir = if (ckd) "" else dir)
    } else {
      list(dict)
    }
  }
  if (is.list(dict)) {
    if (is.null(names(dict))) {
      tn <- if (!is.null(colnames(term.weights))) colnames(term.weights) else names(term.weights)
      names(dict) <- if (!is.null(tn) && length(tn) == length(dict)) tn else paste0("cat", seq_along(dict))
    } else if (any(su <- names(dict) == "")) {
      names(dict)[su] <- if (sum(su) == 1) "cat_unnamed" else paste0("cat_unnamed", seq_len(sum(su)))
      if (!is.null(term.weights) && any(su <- names(term.weights) == "")) {
        names(term.weights)[su] <- if (sum(su) == 1) "cat_unnamed" else paste0("cat_unnamed", seq_len(sum(su)))
      }
    }
  } else {
    dict <- list(dict)
  }
  if (!is.null(term.weights)) {
    if (is.null(dim(term.weights))) {
      if (is.list(term.weights)) {
        if (length(dict) != length(term.weights) && !is.null(names(term.weights[[1]]))) dict <- term.weights
        if (length(dict) == length(term.weights) && !all(names(dict) %in% names(term.weights))) {
          if (is.null(names(term.weights)) || !any(names(term.weights) %in% names(dict))) {
            names(term.weights) <- names(dict)
          } else {
            for (cat in names(dict)[!names(dict) %in% names(term.weights)]) {
              term.weights[cat] <- structure(rep(1, length(dict[[cat]])), names = dict[[cat]])
            }
          }
        }
        for (cat in names(dict)) {
          if (is.null(names(term.weights[[cat]]))) {
            if (length(term.weights[[cat]]) == length(dict[[cat]])) {
              names(term.weights[[cat]]) <- dict[[cat]]
            } else {
              term.weights[[cat]] <- structure(rep(1, length(dict[[cat]])), names = dict[[cat]])
            }
          }
        }
      } else {
        if (is.null(names(term.weights))) {
          if (length(dict[[1]]) == length(term.weights)) {
            term.weights <- list(term.weights)
            names(term.weights) <- names(dict)
            names(term.weights[[1]]) <- dict[[1]]
          } else {
            term.weights <- NULL
            warning("term.weights were dropped as they could not be aligned with dict")
          }
        }
      }
    } else {
      if (length(dict) == 1 && length(dict[[1]]) == nrow(term.weights) &&
        !any(grepl("[a-z]", rownames(term.weights), TRUE))) {
        if (is.factor(dict[[1]])) dict[[1]] <- as.character(dict[[1]])
        if (anyDuplicated(dict[[1]])) {
          dt <- unique(dict[[1]][duplicated(dict[[1]])])
          su <- dict[[1]] %in% dt
          td <- term.weights[su, ]
          tw <- matrix(0, length(dt), ncol(term.weights), dimnames = list(dt, colnames(term.weights)))
          for (term in dt) tw[term, ] <- colMeans(term.weights[dict[[1]] == term, , drop = FALSE], na.rm = TRUE)
          term.weights <- rbind(term.weights[!su, ], tw)
          rownames(term.weights) <- c(dict[[1]][!su], dt)
          dict[[1]] <- rownames(term.weights)
        } else {
          rownames(term.weights) <- dict[[1]]
        }
      }
    }
    if (!is.null(term.weights)) {
      if (!is.list(term.weights)) term.weights <- list(term.weights)
      dlen <- length(dict)
      if (is.null(names(term.weights))) {
        names(term.weights) <- if (length(term.weights) == dlen) names(dict) else seq_along(term.weights)
      }
      if (length(term.weights) > dlen && dlen == 1 && all(vapply(term.weights, length, 0) == length(dict[[1]]))) {
        dict <- lapply(term.weights, function(ws) dict[[1]])
      }
    }
  }
  dict <- lapply(dict, function(cat) {
    if (!is.character(cat)) {
      if (is.null(names(cat))) as.character(cat) else names(cat)
    } else {
      cat
    }
  })
  if (!is.null(bias) && is.null(names(bias))) {
    names(bias) <- if (length(bias) == length(dict)) names(dict) else seq_along(bias)
  }
  if (!is.null(names(term.weights)) && length(names(term.weights)) == length(dict)) names(dict) <- names(term.weights)
  for (n in names(dict)) {
    if (!n %in% names(bias) && any(ii <- !is.na(dict[[n]]) & dict[[n]] == bias.name)) {
      bias[n] <- term.weights[[n]][ii]
      term.weights[[n]][ii] <- 0
    }
  }
  dict_chars <- list(
    all = paste(unique(strsplit(paste0(unique(unlist(dict, use.names = FALSE)), collapse = ""), "")[[1]]),
      collapse = ""
    )
  )
  dict_chars$alpha <- gsub("[^A-Za-z]", "", dict_chars$all)
  dict_chars$case <- if (grepl("[A-Z]", dict_chars$alpha)) {
    if (grepl("[a-z]", dict_chars$alpha)) "mixed" else "upper"
  } else {
    "lower"
  }
  edtm <- substitute(dtm)
  if (is.factor(dtm)) dtm <- as.character(dtm)
  if (is.character(dtm) || !any(grepl("\\s", colnames(dtm)))) {
    if (any(grepl("\\s", unlist(dict, use.names = FALSE)))) {
      if (is.character(dtm)) {
        warning(
          "dict has terms with spaces, so using lma_patcat instead;",
          "\n enter a dtm (e.g., lma_dtm(", paste0(edtm, collapse = ""), ")) to force lma_termcat use"
        )
        args <- list(text = dtm, dict = dict)
        if (!is.null(term.weights)) args$pattern.weights <- term.weights
        if (!is.null(bias)) args$bias <- bias
        if (!missing(glob)) args$globtoregex <- glob
        if (!missing(partial) && !partial) args$boundary <- "\\b"
        if (!missing(dir)) args$dir <- if (ckd) "" else dir
        return(do.call(lma_patcat, args))
      }
    }
    if (is.character(dtm)) {
      if (dict_chars$case == "upper") dtm <- toupper(dtm)
      dtm <- lma_dtm(dtm,
        numbers = grepl("[0-9]", dict_chars$all), punct = grepl('[_/\\?!."-]', dict_chars$all),
        to.lower = dict_chars$case == "lower"
      )
    }
  }
  if (is.null(dim(dtm))) dtm <- t(dtm)
  ats <- attributes(dtm)[c("opts", "WC", "type")]
  ats <- ats[!vapply(ats, is.null, TRUE)]
  atsn <- names(ats)
  ws <- if (is.null(term.filter)) colnames(dtm) else gsub(term.filter, "", colnames(dtm), perl = TRUE)
  if ((missing(to.lower) || !is.logical(to.lower)) && dict_chars$case != "mixed") {
    text_case <- if (any(grepl("[A-Z]", ws))) if (any(grepl("[a-z]", ws))) "mixed" else "upper" else "lower"
    if (text_case == "upper") {
      dict <- lapply(dict, toupper)
      dict_chars$case <- "upper"
    }
    to.lower <- text_case == "lower"
  }
  if (to.lower && dict_chars$case != "lower") {
    dict <- lapply(dict, tolower)
    dict_chars$case <- "lower"
  }
  if (dict_chars$case != "mixed") ws <- (if (dict_chars$case == "lower") tolower else toupper)(ws)
  odict <- dict
  boundaries <- FALSE
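  # formatdict collapses each category's terms into a single regular expression:
  # terms are escaped or glob-converted, and anchored with ^ and $ unless partial matching is requested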
  formatdict <- function(dict, collapse = "|") {
    lab <- if (!escape) {
      lab <- lapply(dict, function(l) {
        if (!any(grepl("[][)(}{]", l))) {
          return(FALSE)
        }
        sl <- strsplit(l, "")
        !any(grepl("\\[.+\\]|\\(.+\\)|\\{.+\\}", l)) || any(vapply(
          sl, function(cs) {
            sum(cs == "[") != sum(cs == "]") &
              sum(cs == "{") != sum(cs == "}") &
              sum(cs == "(") != sum(cs == ")")
          },
          TRUE
        ))
      })
      Filter(isTRUE, lab)
    } else {
      logical()
    }
    if (!partial) {
      s <- "^"
      e <- "$"
    } else {
      s <- e <- ""
    }
    rec <- "([][)(}{*.^$+?\\|\\\\])"
    if (length(lab)) {
      for (l in names(lab)) dict[[l]][lab[[l]]] <- gsub("([][)(}{])", "\\\\\\1", dict[[l]][lab[[l]]])
      rec <- "([*.^$+?\\|])"
    }
    res <- if (escape) {
      lapply(dict, function(l) {
        paste0(s, gsub(rec, "\\\\\\1", l, perl = TRUE), e, collapse = collapse)
      })
    } else {
      lapply(dict, function(l) paste(paste0(s, gsub("([+*])[+*]+", "\\\\\\1+", l), e), collapse = collapse))
    }
    if (glob) {
      lapply(res, function(l) {
        gsub(paste0(
          if (s == "^") "\\" else "", s, if (escape) "\\\\" else "", "\\*|", if (escape) "\\\\" else "", "\\*", if (e == "$") {
            "\\"
          } else {
            ""
          }, e
        ), "", l)
      })
    } else {
      res
    }
  }
  for (l in dict) {
    if (!boundaries) boundaries <- !any(grepl("^\\*|\\*$", l)) && any(grepl("^\\^|\\$$", l))
    if (missing(partial) && boundaries) partial <- TRUE
    if (missing(glob) && (any(grepl("([][}{.^$+?\\|\\\\])", l)) || any(grepl("\\w\\*\\w", l)))) glob <- FALSE
    if (missing(escape) && (boundaries || any(grepl("[.])][+*]|[.+*]\\?|\\[\\^", l))) &&
      !any(grepl("[({[][^])}]*$|^[^({[]*[])}]", l))) {
      escape <- FALSE
    }
  }
  cls <- 0
  if (is.null(term.weights)) {
    cls <- structure(numeric(length(dict)), names = names(dict))
    for (cat in seq_along(dict)) {
      ccls <- tryCatch(nchar(dict[[cat]]), error = function(e) NULL)
      if (is.null(ccls)) {
        warning(
          "dict appears to be misencoded, so results may not be as expected;\n",
          'might try reading the dictionary in with encoding = "latin1"'
        )
        dict[[cat]] <- iconv(dict[[cat]], sub = "#")
        ccls <- nchar(dict[[cat]])
      }
      cls[cat] <- sum(ccls)
    }
  }
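  # when a category's combined pattern would exceed term.break characters,
  # match it in chunks to avoid PCRE compilation errors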
  if (any(cls > term.break)) {
    br <- function(l, e = term.break) {
      f <- ceiling(cls[[l]] / e)
      l <- length(dict[[l]])
      e <- ceiling(l / f)
      o <- lapply(seq_len(f), function(i) seq_len(e) + e * (i - 1))
      o[[f]] <- o[[f]][o[[f]] <= l]
      o
    }
    op <- matrix(0, nrow(dtm), length(dict), dimnames = list(rownames(dtm), names(dict)))
    if (coverage) cop <- op
    for (cat in names(dict)) {
      matches <- if (cls[[cat]] > term.break) {
        unique(unlist(lapply(br(cat), function(s) {
          grep(formatdict(list(dict[[cat]][s]))[[1]], ws, perl = TRUE)
        })))
      } else {
        grep(formatdict(list(dict[[cat]])), ws, perl = TRUE)
      }
      if (length(matches)) {
        su <- dtm[, matches, drop = FALSE]
        op[, cat] <- rowSums(su, na.rm = TRUE)
        if (coverage) cop[, cat] <- rowSums(su != 0, na.rm = TRUE)
      }
    }
    if (coverage) {
      colnames(cop) <- paste0("coverage_", colnames(op))
      op <- cbind(op, cop)
    }
  } else {
    if (!is.null(term.weights)) {
      dict <- formatdict(dict, NULL)
      terms <- unique(unlist(dict))
      termmap <- lapply(terms, grep, ws, perl = TRUE, value = TRUE)
      names(termmap) <- unique(unlist(odict))
      termmap <- Filter(length, termmap)
      if (is.null(dim(term.weights))) {
        op <- matrix(0, nrow(dtm), length(dict), dimnames = list(rownames(dtm), names(dict)))
        if (length(termmap)) {
          weights <- lapply(names(term.weights), function(n) {
            l <- term.weights[[n]]
            if (is.null(names(l)) && n %in% names(dict) && length(dict[[n]]) == length(l)) {
              names(term.weights[[n]]) <- dict[[n]]
              l <- term.weights[[n]]
            }
            if (any(su <- !names(termmap) %in% names(l))) l[names(termmap)[su]] <- 0
            do.call(c, lapply(names(termmap), function(p) {
              structure(rep(l[[p]], length(termmap[[p]])), names = termmap[[p]])
            }))
          })
          names(weights) <- names(term.weights)
          for (cat in names(dict)) {
            if (length(weights[[cat]])) {
              op[, cat] <- as.numeric(dtm[, names(weights[[cat]]), drop = FALSE] %*% weights[[cat]])
            }
          }
        }
      } else {
        if (length(termmap)) {
          weights <- do.call(rbind, lapply(names(termmap), function(p) {
            matrix(
              rep(as.numeric(term.weights[p, ]), length(termmap[[p]])),
              ncol = ncol(term.weights), dimnames = list(termmap[[p]], colnames(term.weights))
            )
          }))
          op <- matrix(0, nrow(dtm), ncol(weights), dimnames = list(rownames(dtm), colnames(weights)))
          for (cat in colnames(op)) {
            op[, cat] <- as.numeric(dtm[, rownames(weights), drop = FALSE] %*% weights[, cat])
          }
        } else {
          op <- matrix(0, nrow(dtm), ncol(term.weights), dimnames = list(rownames(dtm), colnames(term.weights)))
        }
      }
    } else {
      dict <- formatdict(dict)
      if (coverage) {
        op <- vapply(names(dict), function(cat) {
          su <- dtm[, grep(dict[[cat]], ws, perl = TRUE), drop = FALSE]
          c(rowSums(su != 0, na.rm = TRUE), rowSums(su, na.rm = TRUE))
        }, numeric(nrow(dtm) * 2))
        cop <- op[seq_len(nrow(dtm)), ]
        colnames(cop) <- paste0("coverage_", names(dict))
        op <- cbind(op[-seq_len(nrow(dtm)), ], cop)
      } else {
        op <- vapply(names(dict), function(cat) {
          rowSums(dtm[, grep(dict[[cat]], ws, perl = TRUE),
            drop = FALSE
          ], na.rm = TRUE)
        }, numeric(nrow(dtm)))
      }
      if (nrow(dtm) == 1) {
        op <- t(op)
        rownames(op) <- 1
      }
    }
  }
  if (!is.null(bias)) for (n in names(bias)) if (n %in% colnames(op)) op[, n] <- op[, n] + bias[[n]]
  attr(op, "WC") <- if ("WC" %in% atsn) {
    ats$WC
  } else if (all(vapply(seq_len(ncol(dtm)), function(i) {
    is.numeric(dtm[, i]) || is.integer(dtm[, i])
  }, TRUE))) {
    rowSums(dtm, na.rm = TRUE)
  } else {
    NULL
  }
  attr(op, "time") <- c(attr(dtm, "time"), termcat = proc.time()[[3]] - st)
  if ("type" %in% atsn) attr(op, "type") <- ats$type
  op
}
#' Generate a Report of Term Matches
#'
#' Extract matches to fuzzy terms (globs/wildcards or regular expressions) from provided text, in order
#' to assess their appropriateness for inclusion in a dictionary.
#' @param dict A vector of terms, list of such vectors, or a matrix-like object to be
#'   categorized by \code{\link{read.dic}}.
#' @param text A vector of text to extract matches from. If not specified, will use the terms
#'   in the \code{term_map} retrieved from \code{\link{select.lspace}}.
#' @param space A vector space used to calculate similarities between term matches.
#'   Name of the space (see \code{\link{select.lspace}}), a matrix with terms as row names, or
#'   \code{TRUE} to auto-select a space based on matched terms.
#' @param glob Logical; if \code{TRUE}, converts globs (asterisk wildcards) to regular expressions.
#'   If not specified, this will be set automatically.
#' @param parse_phrases Logical; if \code{TRUE} (default) and \code{space} is specified, will
#'   break unmatched phrases into single terms, and average across any matched vectors.
#' @param tolower Logical; if \code{FALSE}, will retain \code{text}'s case.
#' @param punct Logical; if \code{FALSE}, will remove punctuation markings in \code{text}.
#' @param special Logical; if \code{FALSE}, will attempt to replace special characters in \code{text}.
#' @param as_terms Logical; if \code{TRUE}, will treat \code{text} as terms, meaning \code{dict}
#'   terms will only count as matches when matching the complete text.
#' @param bysentence Logical; if \code{TRUE}, will split \code{text} into sentences, and only
#'   consider unique sentences.
#' @param as_string Logical; if \code{FALSE}, returns matches as tables rather than a string.
#' @param outFile File path to write results to, always ending in \code{.csv}.
#' @param space_dir Directory from which \code{space} should be loaded.
#' @param verbose Logical; if \code{FALSE}, will not display status messages.
#' @note
#' Matches are extracted for each term independently, so they may not align with some implementations
#' of dictionaries. For instance, by default \code{\link{lma_patcat}} matches destructively, and sorts
#' terms by length such that shorter terms will not match the same text as longer terms that overlap.
#' Here, the match would show up for both terms.
#' @returns A \code{data.frame} of results, with a row for each unique term, and the following columns:
#' \itemize{
#'   \item \strong{\code{term}}: The originally entered term.
#'   \item \strong{\code{regex}}: The converted and applied regular expression form of the term.
#'   \item \strong{\code{categories}}: Comma-separated category names,
#'     if \code{dict} is a list with named entries.
#'   \item \strong{\code{count}}: Total number of matches to the term.
#'   \item \strong{\code{max_count}}: Number of matches to the most representative
#'     (that with the highest average similarity) variant of the term.
#'   \item \strong{\code{variants}}: Number of variants of the term.
#'   \item \strong{\code{space}}: Name of the latent semantic space, if one was used.
#'   \item \strong{\code{mean_sim}}: Average similarity to the most representative variant among terms
#'     found in the space, if one was used.
#'   \item \strong{\code{min_sim}}: Minimal similarity to the most representative variant.
#'   \item \strong{\code{matches}}: Variants, with counts and similarity (Pearson's r) to the
#'     most representative term (if a space was specified). Either in the form of a comma-separated
#'     string or a \code{data.frame} (if \code{as_string} is \code{FALSE}).
#' }
#' @examples
#' text <- c(
#'   "I am sadly homeless, and suffering from depression :(",
#'   "This wholesome happiness brings joy to my heart! :D:D:D",
#'   "They are joyous in these fearsome happenings D:",
#'   "I feel weightless now that my sadness has been depressed! :()"
#' )
#' dict <- list(
#'   sad = c("*less", "sad*", "depres*", ":("),
#'   happy = c("*some", "happ*", "joy*", "d:"),
#'   self = c("i *", "my *")
#' )
#'
#' report_term_matches(dict, text)
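#'
#' # as an added sketch of the tabular output, return matches as data.frames
#' report_term_matches(dict, text, as_string = FALSE)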
#' @export
report_term_matches <- function(dict, text = NULL, space = NULL, glob = TRUE,
                                parse_phrases = TRUE, tolower = TRUE, punct = TRUE, special = TRUE,
                                as_terms = FALSE, bysentence = FALSE, as_string = TRUE, outFile = NULL,
                                space_dir = getOption("lingmatch.lspace.dir"), verbose = TRUE) {
  if (missing(dict)) stop("dict must be specified", call. = FALSE)
  if (is.null(text)) {
    text <- rownames(select.lspace(dir = space_dir, get.map = TRUE)$term_map)
    as_terms <- TRUE
  }
  if (is.null(text) && is.null(space)) stop("either text or space must be specified", call. = FALSE)
  st <- proc.time()[[3]]
  if (!is.null(text) && !as_terms) {
    if (verbose) cat("preparing text (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
    if (bysentence) text <- read.segments(text, segment.size = 1, bysentence = TRUE)$text
    if (tolower) text <- tolower(text)
    if (!punct) text <- gsub("[,_:;/\\\\.?!\"()\\{}[]|\\]", " ", text)
    if (!special) text <- lma_dict("special", as.function = gsub)(text)
    text <- unique(text)
  }
  if (verbose) cat("preparing dict (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
  if (!is.null(dim(dict)) || (is.character(dict) && length(dict) == 1 && file.exists(dict))) dict <- read.dic(dict)
  terms <- data.frame(term = unique(unlist(dict, use.names = FALSE)))
  if (missing(glob)) {
    glob <- any(grepl("^\\*", terms$term))
    if (!glob) {
      glob <- any(grepl("\\*$", terms$term))
      if (glob) glob <- !any(grepl("(?:\\\\\\w|[].})])\\*$", terms$term))
    }
  }
  space_name <- NULL
  if (is.null(text)) {
    if (is.character(space)) {
      space_name <- space
      if (verbose) cat("preloading space (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
      space <- lma_lspace(space, dir = space_dir)
    }
    text <- rownames(space)
  }
  rawtext <- is.null(space_name) && !as_terms
  terms$regex <- to_regex(list(terms$term), TRUE, glob)[[1]]
  terms <- terms[!is.na(terms$regex) & terms$regex != "", ]
  terms$regex <- if (rawtext) paste0("\\b", terms$regex, "\\b") else paste0("^", terms$regex, "$")
  if (is.list(dict)) {
    if (is.null(names(dict))) names(dict) <- paste0("cat_", seq_along(dict))
    categories <- character(nrow(terms))
    for (cat in names(dict)) {
      su <- terms$term %in% dict[[cat]]
      if (any(su)) {
        ssu <- su & categories != ""
        categories[ssu] <- paste0(categories[ssu], ", ", cat)
        categories[su & categories == ""] <- cat
      }
    }
    terms$categories <- categories
  }
  if (verbose) cat("extracting matches (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
  matches <- extract_matches(terms$regex, text, rawtext)
  has_space <- FALSE
  if (!is.null(space)) {
    obs <- unique(unlist(lapply(matches, names), use.names = FALSE))
    if (is.null(space_name)) {
      if (is.logical(space) && space) {
        space <- select.lspace(terms = obs)$selected
        space <- if (nrow(space)) rownames(space)[1] else NULL
      }
      if (is.character(space)) {
        if (verbose) cat("loading space (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
        space_name <- space
        space <- lma_lspace(obs, space, dir = space_dir)
      }
    }
    if (!nrow(space) || !any(obs %in% rownames(space))) space <- NULL
    if (is.null(space)) {
      warning("failed to recognize space")
    } else {
      su <- obs %in% rownames(space)
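      # for phrases not found whole in the space, fall back on averaging vectors of their single-term parts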
      if (parse_phrases && any(!su)) {
        phrase <- grepl("[ _/-]", obs)
        if (any(phrase)) {
          if (verbose) cat("parsing phrases (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
          split_parts <- strsplit(obs[phrase], "[ _/-]")
          parts <- unique(unlist(split_parts, use.names = FALSE))
          part_vectors <- if (!is.null(space_name)) {
            lma_lspace(parts, space_name)
          } else {
            if (any(parts %in% rownames(space))) space[parts[parts %in% rownames(space)]] else space[0, ]
          }
          if (nrow(part_vectors)) {
            space_terms <- rownames(part_vectors)
            space_dim <- ncol(space)
            names(split_parts) <- obs[phrase]
            agg_vectors <- t(vapply(split_parts, function(p) {
              su <- p %in% space_terms
              if (any(su)) {
                colMeans(part_vectors[p[su], , drop = FALSE])
              } else {
                numeric(space_dim)
              }
            }, numeric(space_dim)))
            space <- rbind(space, agg_vectors[rowSums(agg_vectors) != 0, ])
          }
        }
      }
      if (is.null(space_name)) space_name <- "custom"
      has_space <- TRUE
    }
  }
  if (verbose) cat("preparing results (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
  terms <- cbind(terms, do.call(rbind, lapply(matches, function(m) {
    hits <- if (length(m)) {
      if (has_space) {
        msim <- m
        if (length(m) == 1) {
          msim[] <- 1L
        } else {
          msim[] <- NA
          su <- names(m) %in% rownames(space)
          if (sum(su) == 1) {
            msim[su] <- 1L
          } else if (any(su)) {
            ns <- names(m)[su]
            sims <- lma_simets(space[ns, ], metric = "pearson", symmetrical = TRUE)
            msim[su] <- as.numeric(sims[, which.max(colMeans(sims))])
          }
        }
        o <- order(msim, decreasing = TRUE)
        m <- m[o]
        if (as_string) {
          paste(paste0(names(m), " (", if (!as_terms) paste0(m, ", "), round(msim[o], 2), ")"), collapse = ", ")
        } else {
          list(as.data.frame(rbind(m, msim[o])))
        }
      } else {
        m <- sort(m, TRUE)
        if (as_string) {
          paste(paste0(names(m), if (!as_terms) paste0(" (", m, ")")), collapse = ", ")
        } else {
          list(t(as.data.frame(m)))
        }
      }
    } else {
      if (has_space) msim <- NA
      if (as_string) "" else list(data.frame())
    }
    if (!as_string) rownames(hits[[1]]) <- NULL
    res <- data.frame(
      count = sum(m),
      max_count = if (all(is.na(m))) 0L else max(m, na.rm = TRUE),
      variants = length(m)
    )
    if (has_space) {
      res$space <- space_name
      if (all(is.na(msim))) {
        res$mean_sim <- res$min_sim <- NA
      } else {
        res$mean_sim <- mean(msim, na.rm = TRUE)
        res$min_sim <- min(msim, na.rm = TRUE)
      }
    }
    res[["matches"]] <- hits
    res
  })))
  terms <- terms[if (has_space) order(terms$mean_sim) else order(terms$variants, decreasing = TRUE), ]
  rownames(terms) <- NULL
  if (!is.null(outFile)) {
    outFile <- paste0(sub(".csv", "", outFile, fixed = TRUE), ".csv")
    if (verbose) cat("writing results: ", outFile, " (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
    write.table(terms, outFile, sep = ",", row.names = FALSE, qmethod = "double")
  }
  if (verbose) cat("done (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
  terms
}
#' Document-Term Matrix Creation
#'
#' Creates a document-term matrix (dtm) from a set of texts.
#' @param text Texts to be processed. This can be a vector (such as a column in a data frame)
#'   or list. When a list, these can be in the form returned with \code{tokens.only = TRUE},
#'   or a list with named vectors, where names are tokens and values are frequencies or the like.
#' @param exclude A character vector of words to be excluded. If \code{exclude} is a single string
#'   matching \code{'function'}, \code{lma_dict(1:9)} will be used.
#' @param context A character vector used to reformat text based on look-ahead/behind. For example,
#'   you might attempt to disambiguate \emph{like} by reformatting certain \emph{like}s
#'   (e.g., \code{context = c('(i) like*', '(you) like*', '(do) like')}, where words in parentheses are
#'   the context for the target word, and asterisks denote partial matching). This would be converted
#'   to regular expression (i.e., \code{'(?<=i) like\\\\b'}) which, if matched, would be
#'   replaced with a coded version of the word (e.g., \code{"Hey, i like that!"} would become
#'   \code{"Hey, i i-like that!"}). This would probably only be useful for categorization, where a
#'   dictionary would only include one or another version of a word (e.g., the LIWC 2015 dictionary
#'   does something like this with \emph{like}, and LIWC 2007 did something like this with
#'   \emph{kind (of)}, both to try and clean up the posemo category).
#' @param replace.special Logical: if \code{TRUE}, special characters are replaced with regular
#'   equivalents using the \code{\link{lma_dict}} special function.
#' @param numbers Logical: if \code{TRUE}, numbers are preserved.
#' @param punct Logical: if \code{TRUE}, punctuation is preserved.
#' @param urls Logical: if \code{FALSE}, attempts to replace all urls with "repurl".
#' @param emojis Logical: if \code{TRUE}, attempts to replace emojis (e.g., ":(" would be replaced
#'   with "repfrown").
#' @param to.lower Logical: if \code{FALSE}, words with different capitalization are treated as
#'   different terms.
#' @param word.break A regular expression string determining the way words are split. Default is
#'   \code{' +'} which breaks words at one or more blank spaces. You may also like to break by
#'   dashes or slashes (\code{'[ /-]+'}), depending on the text.
#' @param dc.min Numeric: excludes terms appearing in the set number or fewer documents.
#'   Default is 0 (no limit).
#' @param dc.max Numeric: excludes terms appearing in the set number or more. Default
#'   is Inf (no limit).
#' @param sparse Logical: if \code{FALSE}, a regular dense matrix is returned.
#' @param tokens.only Logical: if \code{TRUE}, returns a list rather than a matrix, with these entries:
#'   \tabular{ll}{
#'     \code{tokens} \tab A vector of indices with terms as names. \cr
#'     \code{frequencies} \tab A vector of counts with terms as names. \cr
#'     \code{WC} \tab A vector of term counts for each document. \cr
#'     \code{indices} \tab A list with a vector of token indices for each document. \cr
#'   }
#' @note
#' This is a relatively simple way to make a dtm. To calculate the (more or less) standard forms of
#' LSM and LSS, a somewhat raw dtm should be fine, because both processes essentially use
#' dictionaries (obviating stemming) and weighting or categorization (largely obviating 'stop word'
#' removal). The exact effect of additional processing will depend on the dictionary/semantic space
#' and weighting scheme used (particularly for LSA). This function also does some processing which
#' may matter if you plan on categorizing with categories that have terms with look-ahead/behind assertions
#' (like LIWC dictionaries). Otherwise, other methods may be faster, more memory efficient, and/or more featureful.
#' @return A sparse matrix (or regular matrix if \code{sparse = FALSE}), with a row per \code{text},
#'   and column per term, or a list if \code{tokens.only = TRUE}. Includes an attribute with options (\code{opts}),
#'   and attributes with word count (\code{WC}) and column sums (\code{colsums}) if \code{tokens.only = FALSE}.
#' @examples
#' text <- c(
#'   "Why, hello there! How are you this evening?",
#'   "I am well, thank you for your inquiry!",
#'   "You are a most good at social interactions person!",
#'   "Why, thank you! You're not all bad yourself!"
#' )
#'
#' lma_dtm(text)
#'
#' # return tokens only
#' (tokens <- lma_dtm(text, tokens.only = TRUE))
#'
#' ## convert those to a regular DTM
#' lma_dtm(tokens)
#'
#' # convert a list-representation to a sparse matrix
#' lma_dtm(list(
#'   doc1 = c(why = 1, hello = 1, there = 1),
#'   doc2 = c(i = 1, am = 1, well = 1)
#' ))
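#'
#' ## an added sketch of the context argument: "like" preceded by "i" becomes
#' ## its own "i-like" token, separate from other uses of "like"
#' colnames(lma_dtm(
#'   c("i like this", "this is like that"),
#'   context = c("(i) like*", "(you) like*")
#' ))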
#' @export
lma_dtm <- function(text, exclude = NULL, context = NULL, replace.special = FALSE, numbers = FALSE,
                    punct = FALSE, urls = TRUE, emojis = FALSE, to.lower = TRUE, word.break = " +", dc.min = 0,
                    dc.max = Inf, sparse = TRUE, tokens.only = FALSE) {
  if (!is.null(dim(text))) {
    if (is.character(text[, 1]) || is.factor(text[, 1])) {
      text <- text[, 1]
    } else {
      stop("enter a vector of texts as the first argument")
    }
  }
  if (is.list(text)) {
    if (all(c("tokens", "indices") %in% names(text))) {
      m <- do.call(rbind, lapply(seq_along(text$indices), function(i) {
        if (length(text$indices[[i]])) {
          inds <- as.factor(text$indices[[i]])
          cbind(i, as.integer(levels(inds)), tabulate(inds))
        }
      }))
      dtm <- sparseMatrix(m[, 1], m[, 2],
        x = m[, 3], dims = c(length(text$indices), length(text$tokens)),
        dimnames = list(NULL, if (is.character(text$tokens)) text$tokens else names(text$tokens))
      )
      if (!sparse) dtm <- as.matrix(dtm)
      attr(dtm, "colsums") <- text$frequencies
      attr(dtm, "type") <- "count"
      attr(dtm, "WC") <- text$WC
      attr(dtm, "opts") <- attr(text, "opts")
      attr(dtm, "time") <- attr(text, "time")
      return(dtm)
    } else {
      tokens <- unlist(unname(text), recursive = FALSE)
      cinds <- unique(names(tokens))
      if (is.null(cinds)) {
        text <- unlist(text)
      } else {
        rinds <- rep(seq_along(text), vapply(text, length, 0))
        cinds <- structure(seq_along(cinds), names = cinds)
        dtm <- sparseMatrix(
          rinds, cinds[names(tokens)],
          x = tokens,
          dims = c(length(text), length(cinds)), dimnames = list(names(text), names(cinds))
        )
        if (!sparse) dtm <- as.matrix(dtm)
        return(dtm)
      }
    }
  }
  if (is.null(text)) stop(substitute(text), " not found")
  docnames <- names(text)
  if (is.character(text) && all(nchar(text) < 500) && all(file.exists(text))) {
    text <- if (length(text) != 1 || dir.exists(text)) read.segments(text) else readLines(text)
  }
  text <- paste(" ", text, " ")
  st <- proc.time()[[3]]
  if (replace.special) text <- lma_dict("special", as.function = gsub)(text)
  text <- gsub("(?<=[^a-z0-9])'|'(?=[^a-z0-9])", '"', text, TRUE, TRUE)
  if (!urls) {
    text <- gsub(paste0(
      "\\s[a-z]+://[^\\s]*|www\\.[^\\s]*|\\s[a-z_~-]+\\.[a-z_~-]{2,}[^\\s]*|\\s[a-z_~-]+\\.",
      "(?:io|com|net|org|gov|edu)\\s"
    ), " repurl ", text, TRUE, TRUE)
    text <- gsub("(?<=[A-Z])\\.\\s", " ", text, perl = TRUE)
  }
  text <- gsub("\\s+", " ", text)
  text <- gsub("\\s(etc|st|rd|ft|feat|dr|drs|mr|ms|mrs|messrs|jr|prof)\\.", " \\1tempperiod", text, TRUE)
  text <- gsub("\\s\\.|\\.\\s", " . ", text)
  if (any(punct, emojis, !is.null(context))) {
    special <- lma_dict(special)[[1]]
    if (!missing(context) && length(context) == 1 && grepl("like", context, TRUE)) {
      context <- special[["LIKE"]]
    }
    if (punct) text <- gsub(special[["ELLIPSIS"]], " repellipsis ", text)
    if (emojis) {
      for (type in c("SMILE", "FROWN")) {
        text <- gsub(
          special[[type]], paste0(" rep", tolower(type), " "), text,
          perl = TRUE
        )
      }
    }
    if (!missing(context)) {
      if (!any(grepl("[?=]", context))) {
        context <- gsub("^\\(", "(?<=", context)
        context <- gsub("\\((?!\\?)", "(?=", context, perl = TRUE)
        context <- gsub("(?<![)*])$", "\\\\b", context, perl = TRUE)
        context <- gsub("\\*", "\\\\w*", context, perl = TRUE)
      }
      context <- structure(
        as.list(context),
        names = paste("", gsub("--+", "-", gsub("[\\s^[]|\\\\s]", "-",
          gsub("[^a-z0-9\\s\\\\']|\\\\[wbs]", "", context, TRUE, TRUE),
          perl = TRUE
        )), "")
      )
      for (rn in names(context)) text <- gsub(context[[rn]], rn, text, perl = TRUE)
    }
  }
  if (to.lower) text <- tolower(text)
  if (!is.null(exclude)) {
    if (length(exclude) == 1 && grepl(exclude, "function", TRUE)) {
      exclude <- unlist(lma_dict(), use.names = FALSE)
    } else if (is.list(exclude)) exclude <- unlist(exclude, use.names = FALSE)
  }
  if (!numbers) text <- gsub("[[:punct:]]*[0-9][0-9,.el-]*", " ", text, TRUE, TRUE)
  text <- gsub(paste0(
    "([^a-z0-9.,':/?=#\\s-]|[:/?=#](?=\\s)|(?:(?<=\\s)[:/=-]|,)(?=[a-z])|(?<=[^a-z0-9])",
    "(,(?=[a-z0-9])|[.-](?=[a-z]))|[.,'-](?=[^0-9a-z]|[.,'-]))"
  ), " \\1 ", text, TRUE, TRUE)
  text <- gsub("(\\s[a-z]+)/([a-z]+\\s)", " \\1 / \\2 ", text, TRUE, TRUE)
  text <- gsub("([a-z0-9.,'-].*[^a-z0-9])", " \\1 ", text, TRUE, TRUE)
  text <- gsub("(?<=[a-z])\\s['\u00E7\u00ED]\\s(?=[a-z])", "'", text, TRUE, TRUE)
  if (!punct) {
    text <- gsub("[^A-Za-z0-9'._-]+", " ", text)
    text <- gsub("(?=[a-z])\\.+|(?<=[^a-z0-9])['._-]+|'+\\s", " ", text, TRUE, TRUE)
  }
  text <- gsub("tempperiod", ".", text, fixed = TRUE)
  text <- gsub("^\\s+|\\s(?=\\s)|\\s+$", "", text, perl = TRUE)
  text <- strsplit(text, word.break)
  words <- sort(unique(unlist(text)))
  words <- words[!words == ""]
  if (!is.null(exclude)) {
    if (is.list(exclude)) exclude <- unlist(exclude, use.names = FALSE)
    if (!any(grepl("^", exclude, fixed = TRUE))) exclude <- gsub("\\^\\*|\\*\\$", "", paste0("^", exclude, "$"))
    if (any(ck <- grepl("[[({]", exclude) + grepl("[})]|\\]", exclude) == 1)) {
      exclude[ck] <- gsub("([([{}\\])])", "\\\\\\1", exclude[ck], perl = TRUE)
    }
    words <- grep(paste(exclude, collapse = "|"), words, value = TRUE, invert = TRUE)
  }
  if (tokens.only) {
    m <- match_terms(
      text, words, !grepl("^[[:punct:]]$|^repellipsis$", words),
      c(length(text), length(words)), is.null(exclude), TRUE
    )
    names(m) <- c("tokens", "frequencies", "WC", "indices")
    m$tokens <- m$tokens + 1L
    m$tokens <- sort(m$tokens)
    inds <- vector("list", length(text))
    l <- 0
    for (i in seq_along(inds)) {
      if (m$WC[i]) {
        inds[[i]] <- m$indices[seq_len(m$WC[i]) + l] + 1L
        l <- l + m$WC[i]
      } else {
        inds[[i]] <- integer()
      }
    }
    m$indices <- inds
    if (dc.min > 0 || dc.max < Inf) {
      su <- m$frequencies > dc.min & m$frequencies < dc.max
      if (any(!su)) {
        if (!any(su)) {
          warning(
            "document count bounds [", dc.min, ", ", dc.max, "] excluded all terms, so they were ignored",
            call. = FALSE
          )
        } else {
          m$frequencies <- m$frequencies[su]
          ex <- m$tokens[!su]
          m$tokens <- m$tokens[su]
          new_inds <- structure(seq_along(m$tokens), names = m$tokens)
          m$tokens[] <- new_inds
          for (i in seq_along(m$indices)) {
            inds <- m$indices[[i]]
            m$indices[[i]] <- unname(new_inds[as.character(inds[!inds %in% ex])])
            m$WC[i] <- length(m$indices[[i]])
          }
        }
      }
    }
    names(m$frequencies) <- names(m$tokens)
  } else {
    msu <- match_terms(
      text, words, !grepl("^[[:punct:]]$|^repellipsis$", words),
      c(length(text), length(words)), is.null(exclude), FALSE
    )
    m <- if (sparse) as(msu[[1]], "CsparseMatrix") else as.matrix(msu[[1]])
    if (length(docnames) == nrow(m)) rownames(m) <- docnames
    su <- msu[[3]] > dc.min & msu[[3]] < dc.max
    names(msu[[3]]) <- words
    if (any(!su)) {
      if (!any(su)) {
        warning(
          "document count bounds [", dc.min, ", ", dc.max, "] excluded all terms, so they were ignored",
          call. = FALSE
        )
      } else {
        m <- m[, su, drop = FALSE]
      }
    }
    attr(m, "WC") <- unlist(msu[[2]], use.names = FALSE)
    attr(m, "colsums") <- msu[[3]]
    attr(m, "type") <- "count"
    if (!missing(dc.min) || !missing(dc.max)) {
      attr(m, "info") <- paste(
        "a lim of", dc.min, "and", dc.max, "left", sum(su), "of", length(words), "unique terms"
      )
    }
  }
  attr(m, "opts") <- c(numbers = numbers, punct = punct, urls = urls, to.lower = to.lower)
  attr(m, "time") <- c(dtm = proc.time()[[3]] - st)
  m
}
#' Categorize Texts
#'
#' Categorize raw texts using a pattern-based dictionary.
#'
#' @param text A vector of text to be categorized. Texts are padded by 2 spaces, and potentially lowercased.
#' @param dict At least a vector of terms (patterns), usually a matrix-like object with columns for terms,
#'   categories, and weights.
#' @param pattern.weights A vector of weights corresponding to terms in \code{dict}, or the column name of
#'   weights found in \code{dict}.
#' @param pattern.categories A vector of category names corresponding to terms in \code{dict}, or the column name of
#'   category names found in \code{dict}.
#' @param bias A constant to add to each category after weighting and summing. Can be a vector with names
#'   corresponding to the unique values in \code{dict[, category]}, but is usually extracted from dict based
#'   on the intercept included in each category (defined by \code{name.map['intname']}).
#' @param to.lower Logical indicating whether \code{text} should be converted to lowercase before processing.
#' @param return.dtm Logical; if \code{TRUE}, only a document-term matrix will be returned, rather than the
#'   weighted, summed, and biased category values.
#' @param exclusive Logical; if \code{FALSE}, each dictionary term is searched for in the original text.
#'   Otherwise (by default), terms are sorted by length (with longer terms being searched for first), and
#'   matches are removed from the text (avoiding subsequent matches to matched patterns).
#' @param drop.zeros Logical; if \code{TRUE}, categories or terms with no matches will be removed.
#' @param boundary A string to add to the beginning and end of each dictionary term. If \code{TRUE},
#'   \code{boundary} will be set to \code{' '}, avoiding pattern matches within words. By default, dictionary
#'   terms are left as entered.
#' @param fixed Logical; if \code{FALSE}, patterns are treated as regular expressions.
#' @param globtoregex Logical; if \code{TRUE}, initial and terminal asterisks are replaced with \code{\\\\b\\\\w*}
#'   and \code{\\\\w*\\\\b} respectively. This will also set \code{fixed} to \code{FALSE} unless fixed is specified.
#' @param name.map A named character vector:
#'   \itemize{
#'     \item \strong{\code{intname}}: term identifying category biases within the term list;
#'       defaults to \code{'_intercept'}
#'     \item \strong{\code{term}}: name of the column containing terms in \code{dict}; defaults to \code{'term'}
#'   }
#'   Missing names are added, so names can be specified positionally (e.g., \code{c('_int',} \code{'terms')}),
#'   or only some can be specified by name (e.g., \code{c(term =} \code{'patterns')}), leaving the rest default.
#' @param dir Path to a folder in which to look for \code{dict} if it is the name of a file to be passed to
#'   \code{\link{read.dic}}.
#' @seealso For applying term-based dictionaries (to a document-term matrix) see \code{\link{lma_termcat}}.
#' @family Dictionary functions
#' @return A matrix with a row per \code{text} and columns per dictionary category, or (when \code{return.dtm = TRUE})
#'   a sparse matrix with a row per \code{text} and column per term. Includes a \code{WC} attribute with original
#'   word counts, and a \code{categories} attribute with row indices associated with each category if
#'   \code{return.dtm = TRUE}.
#' @examples
#' # example text
#' text <- c(
#'   paste(
#'     "Oh, what youth was! What I had and gave away.",
#'     "What I took and spent and saw. What I lost. And now? Ruin."
#'   ),
#'   paste(
#'     "God, are you so bored?! You just want what's gone from us all?",
#'     "I miss the you that was too. I love that you."
#'   ),
#'   paste(
#'     "Tomorrow! Tomorrow--nay, even tonight--you wait, as I am about to change.",
#'     "Soon I will off to revert. Please wait."
#'   )
#' )
#'
#' # make a document-term matrix with pre-specified terms only
#' lma_patcat(text, c("bored?!", "i lo", ". "), return.dtm = TRUE)
#'
#' # get counts of sets of letters
#' lma_patcat(text, list(c("a", "b", "c"), c("d", "e", "f")))
#'
#' # same thing with regular expressions
#' lma_patcat(text, list("[abc]", "[def]"), fixed = FALSE)
#'
#' # match only words
#' lma_patcat(text, list("i"), boundary = TRUE)
#'
#' # match only words, ignoring punctuation
#' lma_patcat(
#'   text, c("you", "tomorrow", "was"),
#'   fixed = FALSE,
#'   boundary = "\\b", return.dtm = TRUE
#' )
#'
#' \dontrun{
#'
#' # read in the temporal orientation lexicon from the World Well-Being Project
#' tempori <- read.csv(
#'   "https://wwbp.org/downloads/public_data/temporalOrientationLexicon.csv"
#' )
#'
#' lma_patcat(text, tempori)
#'
#' # or use the standardized version
#' tempori_std <- read.dic("wwbp_prospection", dir = "~/Dictionaries")
#'
#' lma_patcat(text, tempori_std)
#'
#' ## get scores on the same scale by adjusting the standardized values
#' tempori_std[, -1] <- tempori_std[, -1] / 100 *
#'   select.dict("wwbp_prospection")$selected[, "original_max"]
#'
#' lma_patcat(text, tempori_std)[, unique(tempori$category)]
#' }
100 |
#' @export |
|
101 | ||
102 |
lma_patcat <- function(text, dict = NULL, pattern.weights = "weight", pattern.categories = "category", bias = NULL, |
|
103 |
to.lower = TRUE, return.dtm = FALSE, drop.zeros = FALSE, exclusive = TRUE, boundary = NULL, fixed = TRUE, |
|
104 |
globtoregex = FALSE, name.map = c(intname = "_intercept", term = "term"), dir = getOption("lingmatch.dict.dir")) { |
|
105 | 71x |
text_names <- names(text) |
106 | ! |
if (is.factor(text)) text <- as.character(text) |
107 | ! |
if (!is.character(text)) stop("enter a character vector as the first argument") |
108 | 71x |
text <- paste(" ", text, " ") |
109 | ! |
if (is.null(names(name.map)) && length(name.map) < 3) names(name.map) <- c("intname", "term")[seq_along(name.map)] |
110 | 71x |
wide <- FALSE |
111 | 2x |
if (missing(dict) && missing(pattern.weights) && missing(pattern.categories)) dict <- lma_dict() |
112 | 71x |
if (is.character(dict) && length(dict) == 1 && (file.exists(dict) || grepl("^[A-Za-z_]{3}", dict)) && |
113 | 71x |
missing(pattern.weights) && missing(pattern.categories)) { |
114 | 1x |
if (dir == "") dir <- "~/Dictionaries" |
115 | 1x |
if (!any(file.exists(dict)) && any(file.exists(normalizePath(paste0(dir, "/", dict), "/", FALSE)))) { |
116 | ! |
dict <- normalizePath(paste0(dir, "/", dict), "/", FALSE) |
117 |
} |
|
118 | 1x |
td <- tryCatch(read.dic(dict), error = function(e) NULL) |
119 | 1x |
dict <- if (is.null(td)) list(cat1 = dict) else td |
120 |
} |
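# identify term, weight, and category columns when dict is a matrix-like object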
|
121 | 71x |
if (!is.null(dim(dict))) { |
122 | 13x |
if (is.null(colnames(dict))) { |
123 | 1x |
colnames(dict) <- paste0("X", seq_len(ncol(dict))) |
124 |
} else { |
|
125 | ! |
if (!is.data.frame(dict)) dict <- as.data.frame(as.matrix(dict), stringsAsFactors = FALSE) |
126 | 12x |
terms <- if (name.map[["term"]] %in% colnames(dict)) colnames(dict) != name.map[["term"]] else !logical(ncol(dict)) |
127 | 12x |
if (missing(pattern.weights) && !any(pattern.weights %in% colnames(dict))) { |
128 | 6x |
if (any(su <- terms & vapply(dict, is.numeric, TRUE))) { |
129 | 6x |
terms <- terms & !su |
130 | 6x |
pattern.weights <- dict[, su] |
131 |
} |
|
132 |
} |
|
133 | 12x |
if (missing(pattern.categories) && !pattern.categories %in% colnames(dict)) { |
134 | 6x |
if (any(su <- terms & vapply(dict, function(v) !is.numeric(v) && anyDuplicated(v), TRUE))) { |
135 | 1x |
terms <- terms & !su |
136 | 1x |
pattern.categories <- dict[, su] |
137 | ! |
if (sum(su) > 1) pattern.categories <- do.call(paste, pattern.categories) |
138 |
} |
|
139 |
} |
|
140 | 12x |
if (name.map[["term"]] %in% colnames(dict)) { |
141 | 10x |
dict[, name.map[["term"]]] |
142 | 2x |
} else if (!all(terms)) { |
143 | 1x |
dict <- if (any(terms)) dict[, which(terms)[1]] else rownames(dict) |
144 |
} |
|
145 |
} |
|
146 |
} |
|
147 |
# independently entered wide weights |
|
148 | 71x |
if ((is.null(dict) || is.null(dim(dict))) && (!is.null(ncol(pattern.weights)) || !is.null(ncol(pattern.categories)))) { |
149 | 6x |
weights <- if (!is.null(ncol(pattern.weights))) pattern.weights else pattern.categories |
150 | 6x |
if (!is.null(rownames(weights)) && any(grepl("[^0-9]", rownames(weights)))) { |
151 | 1x |
dict <- rownames(weights) |
152 | 5x |
} else if (is.list(dict) && (length(dict) == 1 || |
153 | 5x |
(length(dict[[1]]) == nrow(weights) && all(vapply(dict, length, 0) == nrow(weights))))) { |
154 | 1x |
dict <- dict[[1]] |
155 |
} |
|
156 | ! |
if (length(dict) != nrow(weights)) stop("dict and wide weights do not align") |
157 | 6x |
wide <- TRUE |
158 | 6x |
if (!missing(pattern.categories) && is.character(pattern.categories) && any(su <- pattern.categories %in% weights)) { |
159 | ! |
weights <- weights[, pattern.categories[su], drop = FALSE] |
160 |
} |
|
161 | 6x |
weights <- weights[, vapply(seq_len(ncol(weights)), function(col) is.numeric(weights[, col]), TRUE), drop = FALSE] |
162 | ! |
if (!ncol(weights)) stop("could not identify numeric weights in wide weights") |
163 | 6x |
lex <- list(terms = dict, weights = weights, category = colnames(weights)) |
164 |
# wide weights in dict |
|
165 | 65x |
} else if (!is.null(dim(dict)) && ( |
166 | 65x |
(length(pattern.weights) > 1 && is.character(pattern.weights)) || |
167 | 65x |
(length(pattern.categories) > 1 && |
168 | 65x |
(length(pattern.categories) != nrow(dict) || all(pattern.categories %in% colnames(dict)))) || |
169 | 65x |
(!any(pattern.weights %in% colnames(dict)) && !any(pattern.categories %in% colnames(dict))) |
170 |
)) { |
|
171 | 8x |
if (any(su <- pattern.weights %in% colnames(dict))) { |
172 | 1x |
categories <- pattern.weights[su] |
173 | 7x |
} else if (any(su <- pattern.categories %in% colnames(dict))) { |
174 | 1x |
categories <- pattern.categories |
175 | 6x |
} else if (any(su <- vapply(colnames(dict), function(v) is.numeric(dict[, v]), TRUE))) { |
176 | 6x |
categories <- colnames(dict)[su] |
177 |
} else { |
|
178 | ! |
stop("could not find weights in dict column names") |
179 |
} |
|
180 | 8x |
wide <- TRUE |
181 | 8x |
if (!name.map[["term"]] %in% colnames(dict)) { |
182 | 1x |
terms <- colnames(dict)[vapply(colnames(dict), function(v) !is.numeric(dict[, v]), TRUE)] |
183 | ! |
if (!length(terms)) stop("could not find terms in dict") |
184 | 1x |
name.map[["term"]] <- if (length(terms) > 1) { |
185 | ! |
su <- vapply(terms, function(v) !anyDuplicated(dict[, v]), TRUE) |
186 | ! |
if (any(su)) terms[which(su)[1]] else terms[1] |
187 |
} else { |
|
188 | 1x |
terms |
189 |
} |
|
190 |
} |
|
191 | 8x |
lex <- list(term = dict[, name.map[["term"]]], weights = dict[, categories, drop = FALSE], category = categories) |
192 |
# independently entered weights and categories |
|
193 | 57x |
} else if (is.null(dim(dict))) { |
194 | 53x |
if (is.null(dict) || (is.numeric(dict) && is.null(names(dict))) || (is.list(dict) && is.numeric(dict[[1]]) && |
195 | 53x |
is.null(names(dict[[1]])))) { |
196 | ! |
stop("could not recognize terms in dict") |
197 |
} |
|
198 | 53x |
n <- length(dict) |
199 | 53x |
lex <- data.frame( |
200 | 53x |
term = if (is.character(dict)) { |
201 | 13x |
dict |
202 | 53x |
} else if (is.numeric(dict)) { |
203 | ! |
names(dict) |
204 | 53x |
} else if (is.list(dict) && |
205 | 53x |
is.numeric(dict[[1]])) { |
206 | 4x |
unlist(lapply(dict, names), use.names = FALSE) |
207 |
} else { |
|
208 | 36x |
unlist(dict, use.names = FALSE) |
209 |
}, |
|
210 | 53x |
category = if (length(pattern.categories) == n) { |
211 | 23x |
if (is.list(dict) && !is.null(names(dict))) { |
212 | 3x |
names(dict) |
213 |
} else { |
|
214 | 20x |
pattern.categories |
215 |
} |
|
216 | 53x |
} else if (is.list(dict)) { |
217 | 21x |
rep(if (!is.null(names(dict))) { |
218 | 21x |
names(dict) |
219 |
} else { |
|
220 | ! |
paste0("cat", seq_along(dict)) |
221 | 21x |
}, vapply(dict, length, 0)) |
222 |
} else { |
|
223 | 9x |
"cat1" |
224 |
}, |
|
225 | 53x |
weights = if (is.numeric(dict)) { |
226 | ! |
unname(dict) |
227 | 53x |
} else if (is.numeric(pattern.weights)) { |
228 | 2x |
if (!is.null(names(pattern.weights)) && is.character(dict) && all(dict %in% names(pattern.weights))) { |
229 | ! |
pattern.weights[dict] |
230 |
} else { |
|
231 | 2x |
pattern.weights |
232 |
} |
|
233 | 53x |
} else if (is.list(dict)) { |
234 | 40x |
if (is.numeric(dict[[1]])) { |
235 | 4x |
unlist(dict, use.names = FALSE) |
236 | 36x |
} else if (is.list(pattern.weights) && is.numeric(pattern.weights[[1]])) { |
237 | 2x |
unlist(pattern.weights, use.names = FALSE) |
238 |
} else { |
|
239 | 34x |
1 |
240 |
} |
|
241 |
} else { |
|
242 | 11x |
1 |
243 | 53x |
}, stringsAsFactors = FALSE |
244 |
) |
|
245 |
} else { |
|
246 | 4x |
term <- if ("term" %in% names(name.map)) name.map[["term"]] else "term" |
247 | 4x |
en <- colnames(dict) |
248 | 4x |
if (!term %in% en) { |
249 | 1x |
su <- vapply(en, function(v) !is.numeric(dict[, v]), TRUE) |
250 | 1x |
if (any(su)) { |
251 | 1x |
term <- en[which(su)[1]] |
252 | 1x |
if (sum(su) > 1) { |
253 | 1x |
su <- su & vapply(en, function(v) !anyDuplicated(dict[, v]), TRUE) |
254 | ! |
if (any(su)) term <- en[which(su)[1]] |
255 |
} |
|
256 |
} else { |
|
257 | ! |
stop("could not recognize terms in dict") |
258 |
} |
|
259 |
} |
|
260 | 4x |
lex <- data.frame( |
261 | 4x |
term = dict[[term]], |
262 | 4x |
category = if (length(pattern.categories) == nrow(dict)) { |
263 | ! |
pattern.categories |
264 | 4x |
} else if (pattern.categories %in% en) dict[[pattern.categories]] else "cat1", |
265 | 4x |
weights = if (length(pattern.weights) == nrow(dict)) { |
266 | ! |
pattern.weights |
267 | 4x |
} else if (all(pattern.weights %in% en)) dict[[pattern.weights]] else 1, stringsAsFactors = FALSE |
268 |
) |
|
269 |
} |
|
270 | 2x |
if (any(lex$category == "")) lex[lex$category == "", "category"] <- "cat_unnamed" |
271 | ! |
if (is.factor(lex$term)) lex$term <- as.character(lex$term) |
272 | 71x |
if (globtoregex || !fixed) { |
273 | 17x |
lex$term <- to_regex(list(lex$term), TRUE, globtoregex)[[1]] |
274 | 4x |
if (missing(fixed)) fixed <- FALSE |
275 |
} |
|
276 | 71x |
if (wide && return.dtm) { |
277 | 1x |
wide <- FALSE |
278 | 1x |
lex <- data.frame(term = lex$term, category = if (length(lex$category) == 1) lex$category else "all") |
279 |
} |
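# pull any intercept terms out of the term list to serve as category biases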
|
280 | 71x |
if (!return.dtm && is.null(bias)) { |
281 | ! |
if (!"intname" %in% names(name.map)) name.map[["intname"]] <- "_intercept" |
282 | 51x |
if (any(su <- lex$term == name.map[["intname"]])) { |
283 | 6x |
if (wide) { |
284 | 1x |
bias <- structure(lex$weights[su, ], names = lex$categories[su]) |
285 | 1x |
lex$term <- lex$term[!su] |
286 | 1x |
lex$weights <- lex$weights[!su, , drop = FALSE] |
287 |
} else { |
|
288 | 5x |
bias <- structure(lex[su, "weights"], names = lex[su, "category"]) |
289 | 5x |
lex <- lex[!su, ] |
290 |
} |
|
291 |
} |
|
292 |
} |
|
293 | 71x |
if (exclusive) { |
294 | 62x |
cls <- tryCatch(-nchar(lex$term), error = function(e) NULL) |
295 | 62x |
if (is.null(cls)) { |
296 | ! |
warning( |
297 | ! |
"dict appears to be misencoded, so results may not be as expected;\n", |
298 | ! |
'might try reading the dictionary in with encoding = "latin1"' |
299 |
) |
|
300 | ! |
lex$term <- iconv(lex$term, sub = "#") |
301 | ! |
cls <- -nchar(lex$term) |
302 |
} |
|
303 | 62x |
if (wide) { |
304 | 13x |
o <- order(cls) |
305 | 13x |
lex$term <- lex$term[o] |
306 | 13x |
lex$weights <- lex$weights[o, ] |
307 |
} else { |
|
308 | 49x |
lex <- lex[order(cls), ] |
309 |
} |
|
310 |
} |
|
311 | 71x |
lex$category <- factor(lex$category, unique(lex$category)) |
312 | 71x |
categories <- levels(lex$category) |
313 | 71x |
if (length(bias)) { |
314 | 2x |
if (is.null(names(bias)) && length(bias) == length(categories)) names(bias) <- categories |
315 | ! |
if (any(su <- !categories %in% names(bias))) bias[categories[su]] <- 0 |
316 |
} else { |
|
317 | 61x |
bias <- structure(integer(length(categories)), names = categories) |
318 |
} |
|
319 | 71x |
bias <- bias[categories] |
320 | 8x |
if (is.logical(boundary) && boundary) boundary <- " " |
321 | 71x |
if (missing(to.lower)) { |
322 | 67x |
if (any(grepl("[A-Z]", lex$term))) { |
323 | 2x |
to.lower <- FALSE |
324 | 1x |
if (!any(grepl("[a-z]", lex$term))) text <- toupper(text) |
325 |
} |
|
326 |
} |
|
327 | 65x |
if (to.lower) text <- tolower(text) |
328 | 71x |
st <- proc.time()[[3]] |
329 | 71x |
terms <- unique(lex$term) |
330 | 71x |
if (!fixed) { |
331 | 17x |
ck <- tryCatch( |
332 | 17x |
suppressWarnings(grepl(paste0("(?:", paste(terms, collapse = "|"), ")"), "", perl = TRUE)), |
333 | 17x |
error = function(e) NULL |
334 |
) |
|
335 | 1x |
if (is.null(ck)) stop("terms contain invalid regular expressions", call. = FALSE) |
336 |
} |
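# count pattern matches in each text; op[[1]] is a text-by-term count matrix, and op[[2]] holds word counts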
|
337 | 70x |
op <- pattern_search( |
338 | 70x |
text, if (is.character(boundary)) paste0(boundary, terms, boundary) else terms, |
339 | 70x |
seq_along(terms) - 1L, fixed, exclusive |
340 |
) |
|
341 | 70x |
colnames(op[[1]]) <- terms |
342 | 70x |
if (return.dtm) { |
343 | 16x |
attr(op[[1]], "categories") <- lapply(categories, function(cat) { |
344 | 63x |
which(colnames(op[[1]]) %in% lex[lex$category == cat, "term"]) |
345 |
}) |
|
346 | 16x |
names(attr(op[[1]], "categories")) <- categories |
347 |
} else { |
|
348 | 54x |
op[[1]] <- vapply(categories, function(cat) { |
349 | 137x |
l <- if (wide) { |
350 | 26x |
data.frame(term = lex$term, weights = if (cat %in% colnames(lex$weights)) { |
351 | 26x |
lex$weights[, cat] |
352 |
} else { |
|
353 | ! |
lex$weights |
354 | 26x |
}, stringsAsFactors = FALSE) |
355 |
} else { |
|
356 | 111x |
lex[lex$category == cat, ] |
357 |
} |
|
358 | 137x |
as.numeric(op[[1]][, l$term, drop = FALSE] %*% l$weights + bias[[cat]]) |
359 | 54x |
}, numeric(length(text))) |
360 | 54x |
if (length(text) == 1) { |
361 | 9x |
op[[1]] <- t(op[[1]]) |
362 | 9x |
rownames(op[[1]]) <- 1 |
363 |
} |
|
364 |
} |
|
365 | ! |
if (length(text_names) == nrow(op[[1]])) rownames(op[[1]]) <- text_names |
366 | 70x |
attr(op[[1]], "WC") <- op[[2]] |
367 | 70x |
attr(op[[1]], "time") <- c(patcat = proc.time()[[3]] - st) |
368 | 1x |
if (drop.zeros) op[[1]] <- op[[1]][, colSums(op[[1]]) != 0, drop = FALSE] |
369 | 70x |
op[[1]] |
370 |
} |
1 |
#' Similarity Calculations |
|
2 |
#' |
|
3 |
#' Enter a numerical matrix, set of vectors, or set of matrices to calculate similarity per vector. |
|
4 |
#' |
|
5 |
#' @param a A vector or matrix. If a vector, \code{b} must also be provided. If a matrix and \code{b} |
|
6 |
#' is missing, each row will be compared. If a matrix and \code{b} is not missing, each row will |
|
7 |
#' be compared with \code{b} or each row of \code{b}. |
|
8 |
#' @param b A vector or matrix to be compared with \code{a} or rows of \code{a}. |
|
9 |
#' @param metric A character or vector of characters at least partially matching one of the |
|
10 |
#' available metric names (or 'all' to explicitly include all metrics), |
|
11 |
#' or a number or vector of numbers indicating the metric by index: |
|
12 |
#' \itemize{ |
|
13 |
#' \item \strong{\code{jaccard}}: \code{sum(a & b) / sum(a | b)} |
|
14 |
#' \item \strong{\code{euclidean}}: \code{1 / (1 + sqrt(sum((a - b) ^ 2)))} |
|
15 |
#' \item \strong{\code{canberra}}: \code{mean(1 - abs(a - b) / (a + b))} |
|
16 |
#' \item \strong{\code{cosine}}: \code{sum(a * b) / sqrt(sum(a ^ 2) * sum(b ^ 2))}
|
17 |
#' \item \strong{\code{pearson}}: \code{(mean(a * b) - (mean(a) * mean(b))) /} \cr |
|
18 |
#' \code{sqrt(mean(a ^ 2) - mean(a) ^ 2) / sqrt(mean(b ^ 2) - mean(b) ^ 2)} |
|
19 |
#' } |
|
20 |
#' @param group If \code{b} is missing and \code{a} has multiple rows, this will be used to make |
|
21 |
#' comparisons between rows of \code{a}, as modified by \code{agg} and \code{agg.mean}. |
|
22 |
#' @param lag Amount to adjust the \code{b} index; either rows if \code{b} has multiple rows (e.g., |
|
23 |
#' for \code{lag = 1}, \code{a[1, ]} is compared with \code{b[2, ]}), or values otherwise (e.g., |
|
24 |
#' for \code{lag = 1}, \code{a[1]} is compared with \code{b[2]}). If \code{b} is not supplied, |
|
25 |
#' \code{b} is a copy of \code{a}, resulting in lagged self-comparisons or autocorrelations. |
|
26 |
#' @param agg Logical: if \code{FALSE}, only the boundary rows between groups will be compared, see |
|
27 |
#' example. |
|
28 |
#' @param agg.mean Logical: if \code{FALSE}, aggregated rows are summed instead of averaged.
|
29 |
#' @param pairwise Logical: if \code{FALSE} and \code{a} and \code{b} are matrices with the same number of |
|
30 |
#' rows, only paired rows are compared. Otherwise (and if only \code{a} is supplied), all pairwise |
|
31 |
#' comparisons are made. |
|
32 |
#' @param symmetrical Logical: if \code{TRUE} and pairwise comparisons between \code{a} rows were made, |
|
33 |
#' the results in the lower triangle are copied to the upper triangle. |
|
34 |
#' @param mean Logical: if \code{TRUE}, a single mean for each metric is returned per row of \code{a}. |
|
35 |
#' @param return.list Logical: if \code{TRUE}, a list-like object will always be returned, with an entry |
|
36 |
#' for each metric, even when only one metric is requested. |
|
37 |
#' @details |
|
38 |
#' Use \code{\link[RcppParallel]{setThreadOptions}} to change parallelization options; e.g., run |
|
39 |
#' \code{RcppParallel::setThreadOptions(4)} before a call to \code{lma_simets} to set the number of CPU
|
40 |
#' threads to 4. |
|
41 |
#' @return Output varies based on the dimensions of \code{a} and \code{b}: |
|
42 |
#' \itemize{ |
|
43 |
#' \item \strong{Out:} A vector with a value per metric. \cr |
|
44 |
#' \strong{In:} Only when \code{a} and \code{b} are both vectors. |
|
45 |
#' \item \strong{Out:} A vector with a value per row. \cr |
|
46 |
#' \strong{In:} Any time a single value is expected per row: \code{a} or \code{b} is a vector, |
|
47 |
#' \code{a} and \code{b} are matrices with the same number of rows and \code{pairwise = FALSE}, a group is |
|
48 |
#' specified, or \code{mean = TRUE}, and only one metric is requested. |
|
49 |
#' \item \strong{Out:} A data.frame with a column per metric. \cr |
|
50 |
#' \strong{In:} When multiple metrics are requested in the previous case. |
|
51 |
#' \item \strong{Out:} A sparse matrix with a \code{metric} attribute with the metric name. \cr |
|
52 |
#' \strong{In:} Pairwise comparisons within an \code{a} matrix or between |
|
53 |
#' an \code{a} and \code{b} matrix, when only 1 metric is requested. |
|
54 |
#' \item \strong{Out:} A list with a sparse matrix per metric. \cr |
|
55 |
#' \strong{In:} When multiple metrics are requested in the previous case. |
|
56 |
#' } |
|
57 |
#' @examples |
|
58 |
#' text <- c( |
|
59 |
#' "words of speaker A", "more words from speaker A", |
|
60 |
#' "words from speaker B", "more words from speaker B" |
|
61 |
#' ) |
|
62 |
#' (dtm <- lma_dtm(text)) |
|
63 |
#' |
|
64 |
#' # compare each entry |
|
65 |
#' lma_simets(dtm) |
|
66 |
#' |
|
67 |
#' # compare each entry with the mean of all entries |
|
68 |
#' lma_simets(dtm, colMeans(dtm)) |
|
69 |
#' |
|
70 |
#' # compare by group (corresponding to speakers and turns in this case) |
|
71 |
#' speaker <- c("A", "A", "B", "B") |
|
72 |
#' |
|
73 |
#' ## by default, consecutive rows from the same group are averaged: |
|
74 |
#' lma_simets(dtm, group = speaker) |
|
75 |
#' |
|
76 |
#' ## with agg = FALSE, only the rows at the boundary between |
|
77 |
#' ## groups (rows 2 and 3 in this case) are used: |
|
78 |
#' lma_simets(dtm, group = speaker, agg = FALSE) |
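#'
#' ## metrics can also be calculated manually; for example, cosine similarity
#' ## between the first two rows, following the formula above:
#' a <- as.numeric(dtm[1, ])
#' b <- as.numeric(dtm[2, ])
#' sum(a * b) / sqrt(sum(a^2) * sum(b^2))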
|
79 |
#' @export |
|
80 | ||
81 |
lma_simets <- function(a, b = NULL, metric = NULL, group = NULL, lag = 0, agg = TRUE, agg.mean = TRUE, |
|
82 |
pairwise = TRUE, symmetrical = FALSE, mean = FALSE, return.list = FALSE) { |
|
83 | 403x |
cf <- NULL |
84 | 403x |
mets <- c("jaccard", "euclidean", "canberra", "cosine", "pearson") |
85 | 403x |
if (missing(metric) && length(b) == 1 && !grepl(" ", b) && |
86 | 403x |
any(grepl(tolower(substr(b, 1, 3)), mets, fixed = TRUE))) { |
87 | 8x |
metric <- b |
88 | 8x |
b <- NULL |
89 |
} |
|
90 | 403x |
met <- match_metric(metric) |
91 | 403x |
if (!length(met$selected)) { |
92 | ! |
stop( |
93 | ! |
"no recognized metric; should match one of ", |
94 | ! |
paste0(mets, collapse = ", "), ", or all" |
95 |
) |
|
96 |
} |
|
97 | 403x |
st <- proc.time()[[3]] |
98 | 403x |
slots <- c("i", "p", "x", "Dim") |
99 | 403x |
if ((is.character(a) || is.factor(a)) && any(grepl("[a-zA-Z]", a))) { |
100 | 2x |
a <- lma_dtm(a) |
101 | 74x |
} else if (is.data.frame(a)) a <- Matrix(as.matrix(a), sparse = TRUE) |
102 | 1x |
if (is.null(b) && !missing(lag) && is.null(dim(a))) b <- a |
103 | 403x |
if (is.null(b)) { |
104 | 88x |
n <- dim(a)[1] |
105 | ! |
if (is.null(n) || n < 2) stop("a must have more than 1 row when b is not provided", call. = FALSE) |
106 | 88x |
if (is.null(group)) { |
107 | 47x |
if (!all(slots %in% slotNames(a))) a <- as(a, "CsparseMatrix") |
108 | 61x |
res <- calculate_similarities(a, NULL, 2, met$dummy) |
109 | 61x |
for (i in seq_along(res)) attr(res[[i]], "metric") <- met$selected[i] |
110 |
} else { |
|
111 | ! |
if (length(group) != n) stop("group is not the same length as a or columns in a") |
112 | 27x |
ager <- if (agg.mean) colMeans else colSums |
113 | 27x |
l <- length(group) |
114 | 27x |
chunks <- NULL |
115 | 27x |
i <- 1 |
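# collect runs of consecutive rows with the same group value as chunks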
116 | 27x |
while (i < l) { |
117 | 130x |
st <- i |
118 | 130x |
g <- group[i] |
119 | 130x |
while (i < l && g == group[i + 1]) i <- i + 1 |
120 | 130x |
chunks <- c(chunks, list(seq(st, i))) |
121 | 130x |
i <- i + 1 |
122 |
} |
|
123 | 15x |
if (!any(chunks[[length(chunks)]] == l)) chunks <- c(chunks, list(l)) |
124 | 27x |
rows <- character(length(chunks) - 1) |
125 | 27x |
res <- as.data.frame(matrix(0, length(chunks) - 1, sum(met$dummy), dimnames = list(NULL, met$selected))) |
126 | 27x |
for (i in seq_len(length(chunks) - 1)) { |
127 | 118x |
s <- chunks[[i]] |
128 | 118x |
sa <- if (agg) s else s[length(s)] |
129 | 118x |
ta <- ager(a[sa, , drop = FALSE]) |
130 | 118x |
s <- chunks[[i + 1]] |
131 | 118x |
sb <- if (agg) s else s[1] |
132 | 118x |
tb <- ager(a[sb, , drop = FALSE]) |
133 | 118x |
res[i, ] <- vector_similarity(ta, tb, met$dummy) |
134 | 118x |
rows[i] <- paste(paste(sa, collapse = ", "), "<->", paste(sb, collapse = ", ")) |
135 |
} |
|
136 | 27x |
rownames(res) <- rows |
137 |
} |
|
138 |
} else { |
|
139 | 315x |
if ((is.character(b) || is.factor(b)) && any(grepl("[a-zA-Z]", b))) { |
140 | 1x |
b <- lma_dtm(b) |
141 | 10x |
} else if (is.data.frame(b)) b <- Matrix(as.matrix(b), sparse = TRUE) |
142 | 315x |
bn <- if (is.null(dim(b))) length(b) else dim(b)[1] |
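# cap lag at one less than the number of b entries, preserving its sign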
143 | 4x |
if (lag && abs(lag) >= bn) lag <- if (lag < 0) -bn + 1 else bn - 1 |
144 | 315x |
res <- if (is.null(dim(b)) && length(a) == bn && (is.null(dim(a)) || any(dim(a) == 1))) { |
145 | 193x |
b <- as.numeric(b) |
146 | 5x |
if (lag) b <- if (lag < 0) c(b[-seq_len(-lag)], numeric(-lag)) else c(numeric(lag), b)[seq_len(bn)] |
147 | 193x |
vector_similarity(as.numeric(a), b, met$dummy) |
148 |
} else { |
|
149 | 2x |
if (is.null(dim(a))) a <- Matrix(a, 1, dimnames = list(NULL, names(a)), sparse = TRUE) |
150 | 61x |
if (!all(slots %in% slotNames(a))) a <- as(a, "CsparseMatrix") |
151 | 57x |
if (is.null(dim(b))) b <- Matrix(b, 1, dimnames = list(NULL, names(b)), sparse = TRUE) |
152 | 19x |
if (!all(slots %in% slotNames(b))) b <- as(b, "CsparseMatrix") |
153 | 122x |
d <- c(dim(a), dim(b)) |
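# if a and b have different numbers of columns, try to align them by column name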
154 | 122x |
if (d[2] != d[4]) { |
155 | 2x |
ns <- colnames(a) |
156 | 2x |
if (!is.null(ns)) { |
157 | 2x |
ns <- ns[ns %in% colnames(b)] |
158 | 2x |
if (length(ns)) { |
159 | 2x |
a <- a[, ns, drop = FALSE] |
160 | 2x |
b <- b[, ns, drop = FALSE] |
161 |
} |
|
162 |
} |
|
163 | 2x |
d <- c(dim(a), dim(b)) |
164 | 2x |
if (d[2] != d[4]) { |
165 | ! |
stop("a and b have a different number of columns, which could not be aligned by name") |
166 |
} |
|
167 |
} |
|
168 | 122x |
if (lag) { |
169 | 4x |
b <- if (lag > 0) { |
170 | 2x |
rbind(Matrix(0, lag, d[4], sparse = TRUE), b[-(seq_len(lag) + d[3] - lag), ]) |
171 |
} else { |
|
172 | 2x |
rbind(b[-seq_len(-lag), ], Matrix(0, -lag, d[4], sparse = TRUE)) |
173 |
} |
|
174 |
} |
|
175 | 122x |
type <- if (((missing(pairwise) || !pairwise) && d[1] == d[3]) || |
176 | 122x |
d[3] == 1) { |
177 | 79x |
1 |
178 |
} else { |
|
179 | 43x |
3 |
180 |
} |
|
181 | 122x |
calculate_similarities(a, b, type, met$dummy) |
182 |
} |
|
183 |
} |
|
184 | 403x |
if ("list" %in% class(res) && length(res)) { |
185 | 183x |
pairwise <- "dtCMatrix" %in% class(res[[1]]) |
186 | 183x |
if ((pairwise && symmetrical) || mean) { |
187 | 17x |
for (i in seq_along(res)) { |
188 | 16x |
if (pairwise && (symmetrical || mean)) res[[i]] <- forceSymmetric(res[[i]], "L") |
189 | 17x |
if (mean) { |
190 | 3x |
res[[i]] <- if (is.null(dim(res[[i]]))) { |
191 | ! |
mean(res[[i]], na.rm = TRUE) |
192 |
} else { |
|
193 | 3x |
(rowSums(res[[i]], TRUE) - 1) / (ncol(res[[i]]) - 1) |
194 |
} |
|
195 |
} |
|
196 |
} |
|
197 |
} |
|
198 | 183x |
if (is.null(dim(res[[1]]))) { |
199 | 85x |
rn <- if (!is.na(nd <- which(c(dim(a), dim(b)) == length(res[[1]]))[1]) && !is.null(rownames(if (nd == 1) a else b))) { |
200 | 14x |
rownames(if (nd == 1) a else b) |
201 |
} else { |
|
202 | 71x |
NULL |
203 |
} |
|
204 | 85x |
if (length(met$selected) == 1) { |
205 | 14x |
if (length(rn) == length(res[[1]])) names(res[[1]]) <- rn |
206 |
} else { |
|
207 | 17x |
attr(res, "row.names") <- if (length(rn) == length(res[[1]])) rn else seq_along(res[[1]]) |
208 | 17x |
attr(res, "class") <- "data.frame" |
209 |
} |
|
210 |
} |
|
211 | 162x |
if (!return.list && length(met$selected) == 1) res <- res[[1]] |
212 |
} |
|
213 | 403x |
attr(res, "time") <- c(simets = proc.time()[[3]] - st) |
214 | 403x |
res |
215 |
} |
1 |
#' Linguistic Matching and Accommodation |
|
2 |
#' |
|
3 |
#' Offers a variety of methods to assess linguistic matching or accommodation, where \emph{matching} |
|
4 |
#' is general similarity (sometimes called \emph{homophily}), and \emph{accommodation} is some form |
|
5 |
#' of conditional similarity (accounting for some base-rate or precedent; sometimes called |
|
6 |
#' \emph{alignment}). |
|
7 |
#' |
|
8 |
#' There are a great many points of decision in the assessment of linguistic similarity and/or |
|
9 |
#' accommodation, partly inherited from the great many points of decision inherent in the numerical
|
10 |
#' representation of language. Two general types of matching are implemented here as sets of |
|
11 |
#' defaults: Language/Linguistic Style Matching (LSM; Niederhoffer & Pennebaker, 2002; Ireland & |
|
12 |
#' Pennebaker, 2010), and Latent Semantic Analysis/Similarity (LSA; Landauer & Dumais, 1997; |
|
13 |
#' Babcock, Ta, & Ickes, 2014). See the \code{type} argument for specifics. |
|
14 |
#' |
|
15 |
#' @param input Texts to be compared; a vector, document-term matrix (dtm; with terms as column names), |
|
16 |
#' or path to a file (.txt or .csv, with texts separated by one or more lines/rows). |
|
17 |
#' @param comp Defines the comparison to be made: |
|
18 |
#' \itemize{ |
|
19 |
#' \item If a \strong{function}, this will be applied to \code{input} within each group (overall if there is |
|
20 |
#' no group; i.e., \code{apply(input, 2, comp)}; e.g., \code{comp = mean} would compare each text to |
|
21 |
#' the mean profile of its group). |
|
22 |
#' \item If a \strong{character} with a length of 1 and no spaces: |
|
23 |
#' \itemize{ |
|
24 |
#' \item If it partially matches one of \code{lsm_profiles}'s rownames, that row will be used as the comparison. |
|
25 |
#' \item If it partially matches \code{'auto'}, the highest correlating \code{lsm_profiles} row will be used. |
|
26 |
#' \item If it partially matches \code{'pairwise'}, each text will be compared to one another. |
|
27 |
#' \item If it partially matches \code{'sequential'}, the last variable in \code{group} will be treated as |
|
28 |
#' a speaker ID (see the Grouping and Comparisons section). |
|
29 |
#' } |
|
30 |
#' \item If a \strong{character vector}, this will be processed in the same way as \code{input}. |
|
31 |
#' \item If a \strong{vector}, either (a) logical or factor-like (having n levels < length) and of the same length as |
|
32 |
#' \code{nrow(input)}, or (b) numeric or logical of length less than \code{nrow(input)}, this will be used to |
|
33 |
#' select a subset of \code{input} (e.g., \code{comp = 1:10} would treat the first 10 rows of \code{input} as the |
|
34 |
#' comparison; \code{comp = type == 'prompt'} would make a logical vector identifying prompts, assuming "type" was |
|
35 |
#' the name of a column in \code{data}, or a variable in the global environment, and the value "prompt" marked the |
|
36 |
#' prompts). |
|
37 |
#' \item If a \strong{matrix-like object} (having multiple rows and columns), or a named vector, this will |
|
38 |
#' be treated as a sort of dtm, assuming there are common (column) names between \code{input} and |
|
39 |
#' \code{comp} (e.g., if you had prompt and response texts that were already processed separately). |
|
40 |
#' } |
|
41 |
#' @param data A matrix-like object as a reference for column names, if variables are referred to in |
|
42 |
#' other arguments (e.g., \code{lingmatch(text, data = data)} would be the same as |
|
43 |
#' \code{lingmatch(data$text)}).
|
44 |
#' @param group A logical or factor-like vector the same length as \code{NROW(input)}, used to define
|
45 |
#' groups. |
|
46 |
#' @param ... Passes arguments to \code{\link{lma_dtm}}, \code{\link{lma_weight}}, |
|
47 |
#' \code{\link{lma_termcat}}, and/or \code{\link{lma_lspace}} (depending on \code{input} and \code{comp}), |
|
48 |
#' and \code{\link{lma_simets}}. |
|
49 |
#' @param comp.data A matrix-like object as a source for \code{comp} variables. |
|
50 |
#' @param comp.group The column name of the grouping variable(s) in \code{comp.data}; if |
|
51 |
#' \code{group} contains references to column names, and \code{comp.group} is not specified, |
|
52 |
#' \code{group} variables will be looked for in \code{comp.data}. |
|
53 |
#' @param order A numeric vector the same length as \code{nrow(input)} indicating the order of the |
|
54 |
#' texts and grouping variables when the type of comparison is sequential. Only necessary if the |
|
55 |
#' texts are not already ordered as desired. |
|
56 |
#' @param drop logical; if \code{TRUE}, will drop columns with a sum of 0. |
|
57 |
#' @param all.levels logical; if \code{FALSE}, multiple groups are combined. See the Grouping and |
|
58 |
#' Comparisons section. |
|
59 |
#' @param type A character at least partially matching 'lsm' or 'lsa'; applies default settings |
|
60 |
#' aligning with the standard calculations of each type: |
|
61 |
#' \tabular{ll}{ |
|
62 |
#' LSM \tab \code{lingmatch(text, weight = 'freq', dict = lma_dict(1:9), metric = 'canberra')}\cr |
|
63 |
#' LSA \tab \code{lingmatch(text, weight = 'tfidf', space = '100k_lsa', metric = 'cosine')}\cr |
|
64 |
#' } |
|
65 |
#' @section Grouping and Comparisons: |
|
66 |
#' Defining groups and comparisons can sometimes be a bit complicated, and requires
|
67 |
#' dataset-specific knowledge, so it can't always (readily) be done automatically. Variables entered in the
|
68 |
#' \code{group} argument are treated differently depending on their position and other arguments: |
|
69 |
#' |
|
70 |
#' \describe{ |
|
71 |
#' \item{Splitting}{By default, groups are treated as if they define separate chunks of data in |
|
72 |
#' which comparisons should be calculated. Functions used to calculate comparisons, and
|
73 |
#' pairwise comparisons, are performed separately in each of these groups. For example, if you
|
74 |
#' wanted to compare each text with the mean of all texts in its condition, a \code{group} |
|
75 |
#' variable could identify and split by condition. Given multiple grouping variables, |
|
76 |
#' calculations will either be done in each split (if \code{all.levels = TRUE}; applied in |
|
77 |
#' sequence so that groups become smaller and smaller), or once after all splits are made (if |
|
78 |
#' \code{all.levels = FALSE}). This makes for 'one-to-many' comparisons with either calculated
|
79 |
#' or preexisting standards (i.e., the profile of the current data, or a precalculated profile, |
|
80 |
#' respectively).} |
|
81 |
#' \item{Comparison ID}{When comparison data is identified in \code{comp}, groups are assumed |
|
82 |
#' to apply to both \code{input} and \code{comp} (either both in \code{data}, or separately |
|
83 |
#' between \code{data} and \code{comp.data}, in which case \code{comp.group} may be needed if |
|
84 |
#' the same grouping variable has different names between \code{data} and \code{comp.data}).
|
85 |
#' In this case, multiple grouping variables are combined into a single factor assumed to |
|
86 |
#' uniquely identify a comparison. This makes for 'one-to-many' comparisons with specific texts
|
87 |
#' (as in the case of manipulated prompts or text-based conditions).} |
|
88 |
#' \item{Speaker ID}{If \code{comp} matches \code{'sequential'}, the last grouping variable |
|
89 |
#' entered is assumed to identify something like speakers (i.e., a factor with two or more |
|
90 |
#' levels and multiple observations per level). In this case, the data are assumed to be ordered |
|
91 |
#' (or ordered once sorted by \code{order} if specified). Any additional grouping variables |
|
92 |
#' before the last are treated as splitting groups. This arrangement supports probabilistic
|
93 |
#' accommodation metrics. At the moment, when sequential comparisons are made within groups, |
|
94 |
#' similarity scores between speakers are averaged, resulting in mean matching between speakers |
|
95 |
#' within the group.} |
|
96 |
#' } |
|
97 |
#' @references |
|
98 |
#' Babcock, M. J., Ta, V. P., & Ickes, W. (2014). Latent semantic similarity and language style |
|
99 |
#' matching in initial dyadic interactions. \emph{Journal of Language and Social Psychology, 33}, |
|
100 |
#' 78-88. |
|
101 |
#' |
|
102 |
#' Ireland, M. E., & Pennebaker, J. W. (2010). Language style matching in writing: synchrony in |
|
103 |
#' essays, correspondence, and poetry. \emph{Journal of Personality and Social Psychology, 99}, |
|
104 |
#' 549. |
|
105 |
#' |
|
106 |
#' Landauer, T. K., & Dumais, S. T. (1997). A solution to Plato's problem: The latent semantic |
|
107 |
#' analysis theory of acquisition, induction, and representation of knowledge. |
|
108 |
#' \emph{Psychological Review, 104}, 211. |
|
109 |
#' |
|
110 |
#' Niederhoffer, K. G., & Pennebaker, J. W. (2002). Linguistic style matching in social interaction. |
|
111 |
#' \emph{Journal of Language and Social Psychology, 21}, 337-360. |
|
112 |
#' @seealso For a general text processing function, see \code{\link{lma_process}}. |
|
113 |
#' @return A list with processed components of the input, information about the comparison, and results of |
|
114 |
#' the comparison: |
|
115 |
#' \itemize{ |
|
116 |
#' \item \strong{\code{dtm}}: A sparse matrix; the raw count-dtm, or a version of the original input |
|
117 |
#' if it is more processed. |
|
118 |
#' \item \strong{\code{processed}}: A matrix-like object; a processed version of the input |
|
119 |
#' (e.g., weighted and categorized). |
|
120 |
#' \item \strong{\code{comp.type}}: A string describing the comparison if applicable. |
|
121 |
#' \item \strong{\code{comp}}: A vector or matrix-like object; the comparison data if applicable. |
|
122 |
#' \item \strong{\code{group}}: A string describing the group if applicable. |
|
123 |
#' \item \strong{\code{sim}}: Result of \code{\link{lma_simets}}. |
|
124 |
#' } |
|
125 |
#' @examples |
|
126 |
#' # compare single strings |
|
127 |
#' lingmatch("Compare this sentence.", "With this other sentence.") |
|
128 |
#' |
|
129 |
#' # compare each entry in a character vector with... |
|
130 |
#' texts <- c( |
|
131 |
#' "One bit of text as an entry...", |
|
132 |
#' "Maybe multiple sentences in an entry. Maybe essays or posts or a book.", |
|
133 |
#' "Could be lines or a column from a read-in file..." |
|
134 |
#' ) |
|
135 |
#' |
|
136 |
#' ## one another |
|
137 |
#' lingmatch(texts) |
|
138 |
#' |
|
139 |
#' ## the first |
|
140 |
#' lingmatch(texts, 1) |
|
141 |
#' |
|
142 |
#' ## the next |
|
143 |
#' lingmatch(texts, "seq") |
|
144 |
#' |
|
145 |
#' ## the set average |
|
146 |
#' lingmatch(texts, mean) |
|
147 |
#' |
|
148 |
#' ## other entries in a group |
|
149 |
#' lingmatch(texts, group = c("a", "a", "b")) |
|
150 |
#' |
|
151 |
#' ## one another, without stop words |
|
152 |
#' lingmatch(texts, exclude = "function") |
|
153 |
#' |
|
154 |
#' ## a standard average (based on function words) |
|
155 |
#' lingmatch(texts, "auto", dict = lma_dict(1:9)) |
|
156 |
#' |
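#' ## sequential comparisons, with the group variable treated as a speaker ID
#' lingmatch(texts, "seq", group = c("A", "B", "B"))
#'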
|
157 |
#' @export |
|
158 |
#' @import methods Matrix |
|
159 |
#' @importFrom stats na.omit dpois ppois |
|
160 |
#' @importFrom Rcpp sourceCpp |
|
161 |
#' @importFrom RcppParallel RcppParallelLibs |
|
162 |
#' @useDynLib lingmatch, .registration = TRUE |
|
163 | ||
164 |
lingmatch <- function(input = NULL, comp = mean, data = NULL, group = NULL, ..., comp.data = NULL, comp.group = NULL, order = NULL, |
|
165 |
drop = FALSE, all.levels = FALSE, type = "lsm") { |
|
166 | 80x |
inp <- as.list(substitute(...())) |
167 |
# setting up a default type if specified |
|
168 | 80x |
if (!missing(type) && !is.null(type)) { |
169 | 39x |
type <- if (grepl("lsm|lang|ling|style|match", type, TRUE)) "lsm" else "lsa" |
170 | 39x |
ni <- names(inp) |
171 | 31x |
if (type == "lsm" && !"dict" %in% ni) inp$dict <- lma_dict(1:9) |
172 | 1x |
if (type != "lsm" && !"space" %in% ni) inp$space <- "100k_lsa" |
173 | 39x |
if (!"metric" %in% ni) inp$metric <- if (type == "lsm") "canberra" else "cosine" |
174 | 39x |
if (is.null(attr(input, "type")) || length(attr(input, "type")) == 1) { |
175 | 25x |
if (type == "lsm" && !"percent" %in% ni) inp$percent <- TRUE |
176 | 6x |
if (type != "lsm" && !"weight" %in% ni) inp$weight <- "tfidf" |
177 |
} |
|
178 |
} |
|
179 | 80x |
mets <- c("jaccard", "euclidean", "canberra", "cosine", "pearson") |
180 | 80x |
inp$metric <- if (!is.null(inp$metric)) match_metric(inp$metric)$selected else "cosine" |
181 | 1x |
if (!length(inp$metric) || all(inp$metric == "")) inp$metric <- "cosine" |
182 | 80x |
vs <- c("input", "comp", "group", "order", "data", "comp.data", "comp.group") |
183 | 80x |
opt <- as.list(match.call(expand.dots = FALSE))[vs] |
184 | 80x |
names(opt) <- vs |
185 |
# organizing options for preprocessing |
|
186 | 80x |
dsp <- lapply(c("lma_dtm", "lma_weight", "lma_lspace", "lma_termcat", "lma_simets"), function(f) { |
187 | 400x |
a <- names(as.list(args(f))) |
188 | 400x |
a <- a[-c(1, length(a))] |
189 | 400x |
inp[a[a %in% names(inp)]] |
190 |
}) |
|
191 | 80x |
names(dsp) <- c("p", "w", "m", "c", "s") |
192 |
# fetches input from data or environment |
|
193 | 80x |
gv <- function(a, data = NULL) { |
194 | 143x |
ta <- a |
195 | 143x |
if (is.character(a)) { |
196 | ! |
if (!is.null(data) && a %in% colnames(data)) { |
197 | ! |
return(unlist(data[, a])) |
198 | ! |
} else if (length(ta) == 1 || !any(grepl(" ", a, fixed = TRUE))) ta <- parse(text = a) |
199 |
} |
|
200 | 143x |
ta <- tryCatch(eval(ta, parent.frame(2)), error = function(e) NULL) |
201 | 143x |
if (!length(ta) || (!is.null(dim(ta)) && !dim(ta)[1])) { |
202 | 98x |
ta <- tryCatch(eval(a, data, parent.frame(2)), error = function(e) NULL) |
203 | 98x |
if (!length(ta) || (!is.null(dim(ta)) && !dim(ta)[1])) { |
204 | 91x |
ta <- tryCatch(eval(a, globalenv()), error = function(e) NULL) |
205 | 91x |
if (is.null(ta)) { |
206 | 91x |
ta <- tryCatch(eval(a, data), error = function(e) NULL) |
207 | 91x |
if (is.null(ta)) { |
208 | 91x |
p <- 2 |
209 | 91x |
while (is.null(ta) && p < 99) { |
210 | 187x |
p <- p + 1 |
211 | 187x |
ta <- tryCatch(eval(a, parent.frame(p)), error = function(e) NULL) |
212 |
} |
|
213 |
} |
|
214 | 1x |
if (is.null(ta)) stop("could not find ", deparse(a), call. = FALSE) |
215 |
} |
|
216 |
} |
|
217 |
} |
|
218 | 142x |
ta |
219 |
} |
|
220 | 80x |
gd <- function(a, data = NULL) { |
221 | 126x |
r <- if (is.character(a) && length(a) == 1 && grepl("\\.(?:csv|txt|tsv|tab)$", a, TRUE)) { |
222 | 3x |
if (file.exists(a)) { |
223 | 3x |
r <- if (grepl("txt$", a)) { |
224 | 1x |
readLines(a, warn = FALSE) |
225 |
} else { |
|
226 | 2x |
r <- read.table(a, TRUE, if (grepl("csv$", a)) "," else "\t", '"', comment.char = "") |
227 | 2x |
r[, which(!vapply(r, is.numeric, TRUE))[1]] |
228 |
} |
|
229 | 3x |
r[r != ""] |
230 |
} else { |
|
231 | ! |
stop(a, " does not exist", call. = FALSE) |
232 |
} |
|
233 | 126x |
} else if (is.character(a)) a else gv(a, data) |
234 | 4x |
if (is.factor(r)) r <- as.character(r) |
235 | 3x |
if (is.character(r) && length(r) == 1 && grepl("\\.(?:csv|txt|tsv|tab)$", r, TRUE)) r <- gd(r) |
236 | 126x |
r |
237 |
} |
|
238 |
# weight, categorize, and/or map |
|
239 | 80x |
wmc <- function(a) { |
240 | 63x |
if (!is.null(colnames(a)) || (length(dsp$c) == 0 && length(dsp$m) == 0)) { |
241 | 23x |
if (length(dsp$w) != 0) a <- do.call(lma_weight, c(list(a), lapply(dsp$w, eval, parent.frame(2)))) |
242 | 21x |
if (length(dsp$c) != 0) a <- do.call(lma_termcat, c(list(a), lapply(dsp$c, eval, parent.frame(2)))) |
243 | 9x |
if (length(dsp$m) != 0) a <- do.call(lma_lspace, c(list(a), lapply(dsp$m, eval, parent.frame(2)))) |
244 |
} |
|
245 | 63x |
a |
246 |
} |
|
247 |
# initial data parsing |
|
248 |
# input |
|
249 | 80x |
if (missing(input)) { |
250 | 1x |
if (!is.null(data)) { |
251 | 1x |
opt$input <- opt$data |
252 | 1x |
input <- data |
253 |
} else { |
|
254 | ! |
input <- file.choose() |
255 | ! |
opt$input <- input |
256 |
} |
|
257 |
} |
|
258 | 80x |
if (is.function(input) || ((is.list(input) || is.numeric(input)) && is.null(dim(input)))) { |
259 | ! |
stop( |
260 | ! |
"enter a character vector or matrix-like object as input", |
261 | ! |
call. = FALSE |
262 |
) |
|
263 |
} |
|
264 | 77x |
if (missing(data)) data <- input |
265 | 80x |
input <- if (is.character(input) && all(input %in% colnames(data))) data[, input] else gd(opt$input, data) |
266 | 11x |
if (!missing(group) && is.data.frame(input)) input <- as.matrix(input[, vapply(input, is.numeric, TRUE)]) |
267 | 80x |
rx <- NROW(input) |
268 | 80x |
cx <- NCOL(input) |
269 |
# comp |
|
270 | 80x |
if (!missing(comp)) { |
271 | 45x |
comp <- gd(opt$comp, if (missing(comp.data)) if (is.call(opt$comp)) NULL else data else comp.data) |
272 | ! |
if (!missing(comp.data) && is.character(comp) && all(comp %in% colnames(comp.data))) comp <- comp.data[, comp] |
273 | 2x |
if (!missing(data) && is.character(comp) && all(comp %in% colnames(data))) comp <- data[, comp] |
274 | 1x |
if (is.logical(comp)) comp <- which(comp) |
275 | 12x |
if (missing(comp.data) && !is.null(colnames(comp))) comp.data <- comp |
276 | 35x |
} else if (missing(comp) && missing(group) && missing(comp.data) && missing(comp.group)) { |
277 | 27x |
opt$comp <- comp <- "pairwise" |
278 |
} else { |
|
279 | 8x |
opt$comp <- "mean" |
280 |
} |
|
281 | 21x |
if (length(opt$comp) > 1) opt$comp <- deparse(opt$comp) |
282 | 2x |
if (is.factor(input)) input <- as.character(input) |
283 | 80x |
if (is.factor(comp)) { |
284 | 2x |
comp <- as.character(comp) |
285 | 78x |
} else if (is.data.frame(comp)) { |
286 | 3x |
comp <- comp[, vapply(comp, is.numeric, TRUE)] |
287 |
} |
|
288 | 80x |
do.wmc <- TRUE |
289 | 80x |
if ("dict" %in% names(inp) && any(class(input) %in% c("matrix", "data.frame")) && |
290 | 80x |
is.null(attr(input, "Type"))) { |
291 | 17x |
cn <- colnames(input) |
292 | 17x |
dn <- gv(inp$dict) |
293 | 10x |
if (is.list(dn)) dn <- names(dn) |
294 | 17x |
if (any(!(ck <- dn %in% cn))) { |
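# map LIWC-style column names onto lsm_profiles category names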
295 | 2x |
cat_map <- structure(c(rep(colnames(lsm_profiles), 2), "article", "prep"), names = c( |
296 | 2x |
colnames(lsm_profiles), "personal_pronouns", "impersonal_pronouns", "articles", "auxiliary_verbs", |
297 | 2x |
"adverbs", "prepositions", "conjunctions", "negations", "quantifiers", "articles", "preps" |
298 |
)) |
|
299 | 2x |
cn <- sub("^liwc[ .:_-]+", "", tolower(cn)) |
300 | 2x |
tr <- cn %in% names(cat_map) |
301 | 2x |
if (any(tr)) colnames(input)[tr] <- cat_map[cn[tr]] |
302 | 2x |
ck <- dn %in% colnames(input) |
303 |
} |
|
304 | 17x |
if (sum(ck) / length(ck) > .75) { |
305 | 17x |
inp$dict <- NULL |
306 | ! |
if (any(!ck)) dn <- dn[ck] |
307 | 17x |
cx <- length(dn) |
308 | 17x |
input <- input[, dn] |
309 | 17x |
do.wmc <- FALSE |
310 | 17x |
if (!missing(comp) && any(class(comp) %in% c("matrix", "data.frame")) && all(dn %in% colnames(comp))) { |
311 | 3x |
comp <- comp[, dn] |
312 |
} |
|
313 |
} |
|
314 |
} |
|
315 | 80x |
if (!is.matrix(input) && is.character(input)) { |
316 |
# if input looks like text, seeing if other text can be added, then converting to a dtm |
|
317 | 15x |
if (is.character(comp) && (length(comp) > 1 || grepl(" ", comp, fixed = TRUE))) { |
318 | 5x |
input <- c(comp, input) |
319 | 5x |
comp <- seq_along(comp) |
320 | 5x |
opt$comp <- "text" |
321 |
} |
|
322 | 15x |
input <- do.call(lma_dtm, c(list(input), dsp$p)) |
323 |
} |
|
324 | 3x |
if (is.data.frame(comp)) comp <- as.matrix(comp) |
325 | 80x |
cc <- if (is.numeric(comp) && (!is.null(comp.data) || is.null(dim(comp)))) { |
326 | 19x |
1 |
327 | 80x |
} else if (is.character(comp)) { |
328 | 44x |
comp <- tolower(comp) |
329 | 44x |
2 |
330 |
} else { |
|
331 | 17x |
0 |
332 |
} |
|
333 |
# group and order |
|
334 | 80x |
agc <- c("c", "list", "cbind", "data.frame") |
335 | 80x |
if (missing(group) && !missing(comp.group)) { |
336 | 2x |
group <- NULL |
337 | 2x |
opt$group <- opt$comp.group |
338 |
} |
|
339 | 80x |
if (!missing(group) && !(is.null(colnames(data)) && rx == length(opt$group) - 1)) { |
340 | 19x |
group <- if (length(opt$group) > 1 && as.character(opt$group[1]) %in% agc && |
341 | 19x |
!grepl("[$[]", as.character(opt$group[1]))) { |
342 | 5x |
group <- tryCatch(gv(opt$group, data), error = function(e) NULL) |
343 | 5x |
if (is.character(group) && is.null(dim(group)) && all(group %in% colnames(data))) { |
344 | 3x |
group <- data[, group] |
345 |
} |
|
346 | 5x |
if (is.null(group)) lapply(opt$group[-1], gv, data) else group |
347 |
} else { |
|
348 | 14x |
if (!is.null(colnames(data)) && is.character(opt$group) && length(opt$group) < nrow(data)) { |
349 | 2x |
if (!all(opt$group %in% colnames(data))) { |
350 | ! |
stop("group appears to be column names, but were not found in data", call. = FALSE) |
351 |
} |
|
352 | 2x |
group <- data[, opt$group] |
353 | 2x |
if (!is.list(group)) group <- if (is.matrix(group)) as.data.frame(group, stringsAsFactors = FALSE) else list(group) |
354 |
} else { |
|
355 | 12x |
group <- gv(opt$group, data) |
356 | 11x |
if (is.factor(group)) { |
357 | 3x |
group <- as.character(group) |
358 | 8x |
} else if (is.matrix(group)) { |
359 | 1x |
group <- as.data.frame(group, row.names = FALSE, stringsAsFactors = FALSE) |
360 |
} |
|
361 | 3x |
if (is.null(dim(group))) list(group) else lapply(group, as.character) |
362 |
} |
|
363 |
} |
|
364 |
} |
|
365 | 79x |
if (!missing(comp.group) || (!is.null(comp.data) && !missing(group))) { |
366 | 3x |
cg <- opt[[if (missing(comp.group)) "group" else "comp.group"]] |
367 | 3x |
if (!is.null(cg)) { |
368 | 3x |
cg <- if (!is.null(comp.data) && length(cg) > 1 && |
369 | 3x |
as.character(cg[1]) %in% agc && !grepl("[$[]", as.character(cg[1]))) { |
370 | 1x |
cg <- tryCatch(gv(cg, comp.data), error = function(e) NULL) |
371 | 1x |
if (is.character(cg) && all(cg %in% colnames(comp.data))) cg <- comp.data[, cg] |
372 | 1x |
if (is.null(cg)) lapply(as.character(cg[-1]), gv, comp.data) else cg |
373 | 3x |
} else if (is.character(cg)) { |
374 | ! |
if (cg %in% colnames(comp.data)) { |
375 | ! |
list(comp.data[, cg]) |
376 |
} else { |
|
377 | ! |
stop("groups not found in comp.data", call. = FALSE) |
378 |
} |
|
379 |
} else { |
|
380 | 2x |
list(gv(cg, comp.data)) |
381 |
} |
|
382 | 3x |
if (is.list(cg) && length(cg) == 1 && !is.null(dim(cg[[1]]))) { |
383 | ! |
cg <- as.data.frame(cg[[1]], stringsAsFactors = FALSE) |
384 | 3x |
} else if (is.character(cg) && !missing(comp.group) && all(cg %in% colnames(comp.data))) { |
385 | ! |
cg <- comp.data[, cg] |
386 |
} |
|
387 | 3x |
if (!missing(comp.group) || length(if (is.list(cg)) cg[[1]] else cg) == nrow(comp.data)) { |
388 | 3x |
if (all.levels) { |
389 | 1x |
comp.group <- cg |
390 |
} else { |
|
391 | 2x |
comp.group <- do.call(paste, cg) |
392 | 2x |
if (length(group) > 1) { |
393 | ! |
group <- do.call(paste, group) |
394 | ! |
if (!is.null(comp.data) && any(ck <- !(ckg <- unique(group)) %in% unique(comp.group))) { |
395 | ! |
if (all(ck)) { |
396 | ! |
stop("group and comp.group had no levels in common", call. = FALSE) |
397 |
} else { |
|
398 | ! |
warning("levels not found in comp.group: ", paste(ckg[ck], collapse = ", "), call. = FALSE) |
399 | ! |
group <- group[ck <- group %in% ckg[!ck]] |
400 | ! |
input <- input[ck, , drop = FALSE] |
401 |
} |
|
402 |
} |
|
403 |
} |
|
404 |
} |
|
405 |
} |
|
406 |
} |
|
407 |
} |
|
408 | 79x |
if (!missing(group)) { |
409 | 2x |
if (is.matrix(group)) group <- as.data.frame(group) |
410 | 18x |
if (length(if (is.list(group)) group[[1]] else group) != rx) { |
411 | ! |
stop("length(group) != nrow(input)", call. = FALSE) |
412 |
} |
|
413 |
} |
|
414 | 79x |
if (!missing(order)) { |
415 | 1x |
order <- gv(opt$order, data) |
416 | 1x |
if (!is.null(order)) { |
417 | 1x |
if (length(order) == rx) { |
418 | 1x |
input <- input[order, ] |
419 | 1x |
group <- lapply(group, "[", order) |
420 |
} else { |
|
421 | ! |
warning("length(order) != nrow(input), so order was not applied", call. = FALSE) |
422 |
} |
|
423 |
} else { |
|
424 | ! |
warning("failed to apply order", call. = FALSE) |
425 |
} |
|
426 |
} |
|
427 | 1x |
if (is.character(input)) input <- matrix(as.numeric(input), rx) |
428 | 79x |
if (is.data.frame(input) && any(ckvc <- !vapply(input, is.numeric, TRUE))) { |
429 | 1x |
if (all(ckvc)) { |
430 | ! |
for (col in seq_along(ckvc)) input[, col] <- as.numeric(input[, col]) |
431 |
} else { |
|
432 | 1x |
input <- input[, !ckvc] |
433 | 1x |
warning("some input variables were not numeric, so they were removed") |
434 |
} |
|
435 |
} |
|
436 | 79x |
dtm <- Matrix(if (is.data.frame(input)) as.matrix(input) else input, sparse = TRUE) |
437 | 63x |
if (do.wmc) input <- wmc(input) |
438 | ! |
if (is.null(dim(input))) input <- t(as.matrix(input)) |
439 | 79x |
if (cc == 2 && (length(comp) > 1 || any(grepl(" ", comp, fixed = TRUE)))) { |
440 | ! |
comp <- do.call(lma_dtm, c(list(comp), dsp$p)) |
441 | ! |
cc <- 1 |
442 |
} |
|
443 |
# if comp appears to be a dtm, unifying input and comp |
|
444 | 3x |
if (cc == 1 && !is.null(names(comp))) comp <- t(as.matrix(comp)) |
445 | 79x |
cr <- nrow(comp) |
446 | 79x |
cn <- colnames(comp) |
447 | 79x |
if (!is.null(cn)) { |
448 | 15x |
cc <- 1 |
449 | 15x |
nn <- cn[!cn %in% colnames(input)] |
450 | 15x |
if (length(nn) != 0) { |
451 | 1x |
input <- cbind( |
452 | 1x |
input, matrix(0, nrow(input), length(nn), dimnames = list(NULL, nn)) |
453 |
) |
|
454 |
} |
|
455 | 15x |
input <- rbind(matrix(0, cr, ncol(input), dimnames = list(NULL, colnames(input))), input) |
456 | 15x |
input[seq_len(cr), cn] <- as.matrix(comp[seq_len(cr), ]) |
457 | 15x |
comp <- seq_len(cr) |
458 |
} |
|
459 | 79x |
if (drop) { |
460 | 1x |
if (sum(su <- colSums(input, na.rm = TRUE) != 0) != 0) { |
461 | 1x |
input <- input[, su, drop = FALSE] |
462 |
} else { |
|
463 | ! |
stop("input is all 0s after processing", call. = FALSE) |
464 |
} |
|
465 |
} |
|
466 | 79x |
nc <- ncol(input) |
467 |
# finalizing comp |
|
468 | 79x |
if (is.numeric(comp) && (cc == 1 || opt$comp == "text")) { |
469 | 23x |
comp.data <- input[comp, , drop = FALSE] |
470 | 23x |
if (!missing(comp.group) && !all.levels) { |
471 | 1x |
if (!anyDuplicated(comp.group) && nrow(comp.data) == length(comp.group)) { |
472 | ! |
rownames(comp.data) <- comp.group |
473 |
} |
|
474 | 22x |
} else if (nrow(comp.data) == 1) { |
475 | 3x |
comp.data <- structure(as.numeric(comp.data[1, ]), |
476 | 3x |
names = colnames(comp.data) |
477 |
) |
|
478 |
} |
|
479 | 23x |
input <- input[-comp, , drop = FALSE] |
480 | 56x |
} else if (cc == 2) { |
481 | 44x |
ckp <- FALSE |
482 | 44x |
if (grepl("^pa|^se", comp)) { |
483 | 41x |
opt$comp <- if (grepl("^pa", comp)) "pairwise" else "sequential" |
484 | 3x |
} else if (any(!is.na(p <- pmatch(comp, rownames(lsm_profiles))))) { |
485 | 2x |
opt$comp <- rownames(lsm_profiles)[p] |
486 | 2x |
ckp <- TRUE |
487 | 2x |
comp.data <- lsm_profiles[p, , drop = FALSE] |
488 | 1x |
} else if (grepl("^au", comp)) { |
489 | 1x |
p <- colMeans(input, na.rm = TRUE) |
490 | 1x |
p <- which.max(lma_simets(lsm_profiles, p, "pearson")) |
491 | 1x |
opt$comp <- paste("auto:", names(p)) |
492 | 1x |
ckp <- TRUE |
493 | 1x |
comp.data <- lsm_profiles[p, , drop = FALSE] |
494 |
} else { |
|
495 | ! |
opt$comp <- substitute(comp) |
496 |
} |
|
497 | 44x |
if (ckp) { |
498 | 3x |
if (any(ckp <- !(cn <- colnames(input)) %in% (bn <- colnames(comp.data)))) { |
499 | ! |
if (all(ckp)) stop("input and comp have no columns in common", call. = FALSE) |
500 | 2x |
if ("articles" %in% cn && !"articles" %in% bn) bn[bn == "article"] <- "articles" |
501 | 2x |
if ("preps" %in% cn && !"preps" %in% bn) bn[bn == "prep"] <- "preps" |
502 | 2x |
colnames(comp.data) <- bn |
503 | 2x |
if (any(ckp <- !cn %in% bn)) { |
504 | ! |
warning("input columns were not found in comp: ", paste(cn[ckp], collapse = ", "), call. = FALSE) |
505 | ! |
comp.data <- comp.data[, cn[!ckp], drop = FALSE] |
506 |
} |
|
507 |
} else { |
|
508 | 1x |
comp.data <- comp.data[, cn, drop = FALSE] |
509 |
} |
|
510 |
} |
|
511 | 12x |
} else if (!is.null(comp.data)) { |
512 | 2x |
cn <- colnames(input) |
513 | 2x |
cns <- cn[ck <- cn %in% colnames(comp.data)] |
514 | 2x |
if (!any(ck)) { |
515 | ! |
stop("input and comp have no columns in common", call. = FALSE) |
516 | 2x |
} else if (any(!ck)) { |
517 | ! |
warning("input columns were not found in comp: ", paste(cn[!ck], collapse = ", "), call. = FALSE) |
518 | ! |
input <- input[, cns] |
519 |
} |
|
520 | 2x |
comp.data <- comp.data[, cns, drop = FALSE] |
521 |
} |
|
522 | 79x |
compmeanck <- opt$comp == "mean" |
523 | 79x |
sim <- speaker <- NULL |
524 | 79x |
if (!is.null(group)) { |
525 | 18x |
if (!is.null(comp.data) && (NROW(comp.data) == 1 || (is.list(group) && length(group[[1]]) != nrow(input)))) { |
526 | ! |
group <- NULL |
527 | ! |
warning("group does not appear to be meaningful for this comparison, so it was ignored", |
528 | ! |
call. = FALSE |
529 |
) |
|
530 | ! |
} else if (!is.list(group)) group <- list(group) |
531 | 18x |
gl <- length(group) |
532 | 18x |
if (opt$comp == "sequential") { |
533 | 7x |
speaker <- group[[gl]] |
534 | 7x |
group <- if (gl == 1) NULL else group[-gl] |
535 | 7x |
gl <- length(group) |
536 |
} |
|
537 | 18x |
if (gl > 1 && !all.levels) { |
538 | 3x |
group <- list(do.call(paste, group)) |
539 | 3x |
gl <- 1 |
540 |
} |
|
541 | 18x |
if (gl) { |
542 | 15x |
sim <- as.data.frame(group, stringsAsFactors = FALSE) |
543 | 15x |
colnames(sim) <- paste0("g", seq_len(gl)) |
544 | 15x |
for (m in inp$metric) sim[, m] <- NA |
545 | 15x |
mets <- seq_along(inp$metric) + gl |
546 |
} |
|
547 | ! |
} else if (opt$comp == "sequential" && is.null(speaker)) speaker <- seq_len(nrow(input)) |
548 |
# making comparisons |
|
549 | 79x |
sal <- dsp$s |
550 | 79x |
ck_grouppair <- !(!is.null(group) && if (is.null(comp.group)) { |
551 | 12x |
!is.null(rownames(comp.data)) |
552 |
} else { |
|
553 | 3x |
!anyDuplicated(comp.group) |
554 |
}) |
|
555 | 79x |
if (ck_grouppair && !is.logical(sal$mean)) { |
556 | 73x |
sal$mean <- isTRUE(grepl("T", sal$mean, fixed = TRUE)) |
557 |
} |
|
558 | 79x |
ckf <- is.function(comp) |
559 | 79x |
apply_comp <- function(m) { |
560 | 1x |
a <- names(as.list(args(comp))) |
561 | 1x |
if ("na.rm" %in% a) { |
562 | 1x |
apply(m, 2, comp, na.rm = TRUE) |
563 | ! |
} else if ("na.action" %in% a) { |
564 | ! |
apply(m, 2, comp, na.action = na.omit) |
565 |
} else { |
|
566 | ! |
apply(m, 2, comp) |
567 |
} |
|
568 |
} |
|
569 | 79x |
if (is.null(group)) { |
570 | 3x |
if (!is.null(speaker)) sal$group <- speaker |
571 | 64x |
if (!is.null(comp.data)) { |
572 | 25x |
if (ckf) { |
573 | 1x |
opt$comp <- paste(if (length(opt$comp.data) > 1) deparse(opt$comp.data) else opt$comp.data, opt$comp) |
574 | 1x |
sal$b <- comp.data <- if (is.null(dim(comp.data))) { |
575 | ! |
comp.data |
576 | 1x |
} else if (compmeanck) colMeans(comp.data, na.rm = TRUE) else apply_comp(comp.data) |
577 |
} else { |
|
578 | 24x |
sal$b <- comp.data |
579 |
} |
|
580 | 39x |
} else if (ckf) { |
581 | 1x |
sal$b <- comp.data <- if (compmeanck) { |
582 | ! |
colMeans(input, na.rm = TRUE) |
583 |
} else { |
|
584 | 1x |
apply_comp(input) |
585 |
} |
|
586 |
} |
|
587 | 4x |
if (!"b" %in% names(sal) && (is.numeric(comp) || !is.null(dim(comp)))) sal$b <- comp |
588 | 64x |
sim <- do.call(lma_simets, c(list(input), sal)) |
589 |
} else { |
|
590 | 15x |
gs <- as.character(unique(sim[, 1])) |
591 | 15x |
cks <- !is.null(speaker) |
592 | 15x |
ckc <- !is.null(comp.data) |
593 | 15x |
ckp <- cc == 2 && opt$comp == "pairwise" |
594 | 15x |
if (gl == 1) { |
595 | 12x |
if (opt$comp != "pairwise") { |
596 | 9x |
if (opt$comp == "sequential") { |
597 | 4x |
group <- sim[, 1] |
598 | 4x |
sim <- do.call(rbind, lapply(gs, function(g) { |
599 | 32x |
su <- which(group == g) |
600 | 32x |
s <- speaker[su] |
601 | 32x |
r <- if (length(su) < 2 || length(unique(s)) < 2) { |
602 | 10x |
data.frame(group = g, structure(as.list(numeric(length(mets)) + 1), |
603 | 10x |
names = inp$metric |
604 | 10x |
), row.names = paste(su, collapse = ", "), stringsAsFactors = FALSE) |
605 |
} else { |
|
606 | 22x |
sal$group <- s |
607 | 22x |
r <- do.call(lma_simets, c(list(input[su, , drop = FALSE]), sal)) |
608 | 22x |
rs <- as.integer(unlist(strsplit(rownames(r), "[^0-9]+"))) |
609 | 22x |
rownames(r) <- strsplit(do.call(sprintf, c( |
610 | 22x |
paste(gsub("[0-9]+", "%i", rownames(r)), collapse = "|"), as.list(rs - 1 + su[1]) |
611 | 22x |
)), "|", fixed = TRUE)[[1]] |
612 | 22x |
data.frame(group = g, r, stringsAsFactors = FALSE) |
613 |
} |
|
614 |
})) |
|
615 |
} else { |
|
616 | 5x |
if (is.null(sal$pairwise)) sal$pairwise <- ck_grouppair |
617 | 5x |
flat <- ckf || !isTRUE(sal$pairwise) || isTRUE(sal$mean) |
618 | 5x |
sal$return.list <- !flat |
619 | 1x |
if (!flat) fsim <- list() |
620 | 5x |
ckmc <- FALSE |
621 | 5x |
if (!ckc && ckf) { |
622 | 3x |
ckmc <- TRUE |
623 | 3x |
opt$comp <- paste0(if (length(opt$group) == 1) paste(opt$group, ""), "group ", opt$comp) |
624 | 3x |
comp.data <- as.data.frame( |
625 | 3x |
matrix(NA, length(gs), nc, dimnames = list(gs, colnames(input))), |
626 | 3x |
stringsAsFactors = FALSE |
627 |
) |
|
628 |
} |
|
629 | 5x |
for (g in gs) { |
630 | 33x |
su <- sim[, 1] == g |
631 | 33x |
sal$b <- NULL |
632 | 33x |
if (ckc) { |
633 | 4x |
sal$b <- comp.data[if (!is.null(comp.group)) comp.group == g else g, , drop = FALSE] |
634 |
} else { |
|
635 | 29x |
sal$b <- input[su, ] |
636 |
} |
|
637 | 33x |
if (ckf && !is.null(dim(sal$b))) { |
638 | 31x |
sal$b <- if (compmeanck) colMeans(sal$b, na.rm = TRUE) else apply_comp(sal$b) |
639 |
} |
|
640 | 29x |
if (!is.null(sal$b) && ckmc) comp.data[g, ] <- sal$b |
641 | 33x |
if (sum(su) == 1 && is.null(sal$b)) { |
642 | ! |
sim[su, mets] <- 1 |
643 | ! |
next |
644 |
} |
|
645 | 33x |
tm <- do.call(lma_simets, c(list(input[su, , drop = FALSE]), sal)) |
646 | 33x |
if (flat) { |
647 | 31x |
sim[su, mets] <- tm |
648 |
} else { |
|
649 | 2x |
fsim[[g]] <- tm |
650 |
} |
|
651 |
} |
|
652 | 1x |
if (!flat) sim <- fsim |
653 |
} |
|
654 |
} else { |
|
655 | 3x |
ug <- unique(group[[1]]) |
656 | 3x |
if (isTRUE(sal$mean)) { |
657 | 1x |
sim <- data.frame(group[[1]], NA, stringsAsFactors = FALSE) |
658 | 1x |
colnames(sim) <- c(opt$group, sal$metric) |
659 | 1x |
for (g in ug) { |
660 | 2x |
su <- group[[1]] == g |
661 | 2x |
sim[su, -1] <- if (sum(su) == 1) 1 else do.call(lma_simets, c(list(input[su, ]), sal)) |
662 |
} |
|
663 |
} else { |
|
664 | 2x |
sim <- lapply(structure(ug, names = ug), function(g) { |
665 | 6x |
su <- group[[1]] == g |
666 | 6x |
if (sum(su) != 1) { |
667 | 6x |
do.call(lma_simets, c(list(input[su, ]), sal)) |
668 |
} else { |
|
669 | ! |
rep(NA, length(sal$metric)) |
670 |
} |
|
671 |
}) |
|
672 |
} |
|
673 |
} |
|
674 | 3x |
} else if (gl > 1) { |
675 | 3x |
for (i in seq_len(gl - 1)) sim <- cbind(sim, sim[, mets]) |
676 | 3x |
sug <- seq_len(gl) |
677 | 3x |
cn <- paste0("g", sug) |
678 | 3x |
mn <- length(inp$metric) |
679 | 3x |
mw <- seq_len(mn) |
680 | 3x |
colnames(sim)[-sug] <- paste0(rep(vapply(seq_along(cn), function(e) { |
681 | 6x |
paste0(cn[seq_len(e)], collapse = "_") |
682 | 3x |
}, ""), each = mn), "_", inp$metric) |
683 | 3x |
group <- vapply(sug, function(g) do.call(paste, group[seq_len(g)]), character(nrow(sim))) |
684 | 3x |
if (!missing(comp.group)) { |
685 | 1x |
comp.group <- vapply(sug, function(g) { |
686 | 2x |
do.call(paste, comp.group[seq_len(g)]) |
687 | 1x |
}, character(length(comp.group[[1]]))) |
688 |
} |
|
689 | 2x |
if (is.null(sal$pairwise)) sal$pairwise <- ck_grouppair |
690 | 3x |
flat <- ckf || isTRUE(sal$mean) |
691 | 1x |
if (!flat) fsim <- list() |
692 | 3x |
ssl <- if (is.null(speaker)) TRUE else !is.na(speaker) |
693 | 3x |
for (g in unique(sim[, 1])) { |
694 | 2x |
if (!flat && is.null(fsim[[g]])) fsim[[g]] <- list() |
695 | 6x |
su <- which(sim[, 1] == g & ssl) |
696 | 6x |
sg <- group[su, , drop = FALSE] |
697 | 6x |
sx <- input[su, , drop = FALSE] |
698 | 6x |
gck <- ckc && !missing(comp.group) |
699 | 6x |
if (gck) { |
700 | 2x |
gcsub <- comp.group[, 1] == g |
701 | 2x |
if (!any(gcsub)) { |
702 | ! |
warning("the first comparison group has no levels in common with the first data group", |
703 | ! |
call. = FALSE |
704 |
) |
|
705 | ! |
gck <- FALSE |
706 |
} |
|
707 |
} |
|
708 | 6x |
for (s in sug) { |
709 | 12x |
usg <- unique(sg[, s]) |
710 | 12x |
if (length(usg) == 1) { |
711 | 6x |
ssg <- list(sx) |
712 | 6x |
names(ssg) <- usg |
713 |
} else { |
|
714 | 6x |
ssg <- lapply(usg, function(ss) sx[sg[, s] == ss, , drop = FALSE]) |
715 | 6x |
names(ssg) <- usg |
716 |
} |
|
717 | 12x |
if (length(ssg) != 0) { |
718 | 12x |
for (ssn in names(ssg)) { |
719 | 18x |
ssu <- su[sg[, s] == ssn] |
720 | 6x |
if (!flat && is.null(fsim[[g]][[ssn]])) fsim[[g]][[ssn]] <- list() |
721 | 18x |
if (cks) { |
722 | ! |
sal$group <- speaker[ssu] |
723 | 18x |
} else if (ckf && !is.null(dim(ssg[[ssn]]))) { |
724 | 12x |
sal$b <- if (compmeanck) { |
725 | 12x |
colMeans(ssg[[ssn]], na.rm = TRUE) |
726 |
} else { |
|
727 | ! |
apply_comp(ssg[[ssn]]) |
728 |
} |
|
729 |
} |
|
730 | 18x |
csu <- gl + mw + (mn * (s - 1)) |
731 | 18x |
if (gck) { |
732 | 6x |
gcsu <- comp.group[, s] == ssn & gcsub |
733 | 6x |
if (!any(gcsu)) { |
734 | ! |
warning( |
735 | ! |
"no ", paste(usg, collapse = ", "), " level found in the comparison group(s)" |
736 |
) |
|
737 |
} else { |
|
738 | 6x |
sal$b <- comp.data[gcsu, , drop = FALSE] |
739 |
} |
|
740 |
} |
|
741 | 18x |
ssim <- do.call(lma_simets, c(list(ssg[[ssn]]), sal)) |
742 | 18x |
if (flat) { |
743 | 12x |
sim[ssu, csu] <- ssim |
744 |
} else { |
|
745 | 6x |
fsim[[g]][[ssn]][[colnames(sim)[csu]]] <- ssim |
746 |
} |
|
747 |
} |
|
748 |
} |
|
749 |
} |
|
750 |
} |
|
751 | 1x |
if (!flat) sim <- fsim |
752 |
} |
|
753 |
} |
|
754 | 79x |
list( |
755 | 79x |
dtm = dtm, |
756 | 79x |
processed = input, |
757 | 79x |
comp.type = if (!is.null(opt$comp)) { |
758 | 79x |
if (is.character(opt$comp)) { |
759 | 73x |
opt$comp |
760 |
} else { |
|
761 | 6x |
gsub('"', "'", as.character(deparse(opt$comp))) |
762 |
} |
|
763 |
}, |
|
764 | 79x |
comp = comp.data, |
765 | 79x |
group = if (!is.null(opt$group)) { |
766 | 18x |
if (is.character(opt$group)) { |
767 | 2x |
opt$group |
768 |
} else { |
|
769 | 16x |
gsub('"', "'", as.character(deparse(opt$group))) |
770 |
} |
|
771 |
}, |
|
772 | 79x |
sim = sim |
773 |
) |
|
774 |
} |
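# A minimal sketch of how this list surfaces in a lingmatch result (texts and
# values here are illustrative):
#   lm <- lingmatch(c("a b c", "a b d"), comp = mean)
#   lm$comp.type # label for the comparison (e.g., "mean")
#   lm$comp      # the comparison vector or matrix that was used
#   lm$sim       # the similarity results assembled above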
1 |
.onLoad <- function(lib, pkg) { |
|
2 | ! |
if (is.null(getOption("lingmatch.lspace.dir"))) options(lingmatch.lspace.dir = "") |
3 | ! |
if (is.null(getOption("lingmatch.dict.dir"))) options(lingmatch.dict.dir = "") |
4 |
} |
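# .onLoad only initializes empty defaults; either option can be pointed at a
# real folder at any time, e.g.:
#   options(
#     lingmatch.dict.dir = "~/Dictionaries",
#     lingmatch.lspace.dir = "~/Latent Semantic Spaces"
#   )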
|
5 | ||
6 |
match_metric <- function(x) { |
|
7 | 445x |
mets <- c("jaccard", "euclidean", "canberra", "cosine", "pearson") |
8 | 445x |
sel <- if (is.null(x) || (length(x) == 1 && grepl(tolower(substr(x, 1, 1)), "a", fixed = TRUE))) { |
9 | 21x |
mets |
10 | 445x |
} else if (is.function(x)) { |
11 | ! |
stop("only internal metrics are available: ", paste(mets, collapse = ", "), call. = FALSE) |
12 |
} else { |
|
13 | 424x |
if (is.numeric(x)) { |
14 | 1x |
mets[x] |
15 |
} else { |
|
16 | ! |
if (is.call(x)) x <- eval(x) |
17 | 423x |
su <- grepl("^(?:cor|r)", x, TRUE) |
18 | 2x |
if (any(su)) x[su] <- "pearson" |
19 | 423x |
unique(unlist(lapply(substr(x, 1, 3), grep, mets, fixed = TRUE, value = TRUE))) |
20 |
} |
|
21 |
} |
|
22 | 445x |
list(all = mets, selected = sel, dummy = as.integer(mets %in% sel)) |
23 |
} |
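# match_metric() resolves partial or abbreviated metric names; for instance:
#   match_metric("cos")$selected # "cosine"
#   match_metric("r")$selected   # "pearson" ("r" and "cor" map to pearson)
#   match_metric(NULL)$selected  # all five metrics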
|
24 | ||
25 |
to_regex <- function(dict, intext = FALSE, isGlob = TRUE) { |
|
26 | 23x |
lapply(dict, function(l) { |
27 | 26x |
l <- gsub("([+*])[+*]+", "\\\\\\1+", sub("(?<=[^\\\\])\\\\$", "\\\\\\\\", l, perl = TRUE)) |
28 | 26x |
if (isGlob) { |
29 | 16x |
l <- gsub("([.^$?(){}[-]|\\])", "\\\\\\1", l, perl = TRUE) |
30 | 4x |
if (!intext) l <- gsub("\\^\\*|\\*\\$", "", paste0("^", l, "$")) |
31 | 16x |
l <- gsub("\\*", "[^\\\\s]*", l) |
32 | 10x |
} else if (any(ck <- grepl("[[({]", l) + grepl("[})]|\\]", l) == 1)) { |
33 | ! |
l[ck] <- gsub("([([{}\\])])", "\\\\\\1", l[ck], perl = TRUE) |
34 |
} |
|
35 | 26x |
l |
36 |
}) |
|
37 |
} |
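# to_regex() turns glob-style dictionary entries into regular expressions, e.g.:
#   to_regex(list(c("honor*", "hon*r")))
#   # [[1]] "^honor"        (a trailing * leaves the end open)
#   #       "^hon[^\\s]*r$" (an internal * matches any non-space characters)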
|
38 | ||
39 |
download.resource <- function( |
|
40 |
type, resource, include.terms = TRUE, decompress = TRUE, |
|
41 |
check.md5 = TRUE, mode = "wb", dir = "", overwrite = FALSE) { |
|
42 | 4x |
if (dir == "") { |
43 | ! |
stop(paste0( |
44 | ! |
"specify a directory (dir), or set the ", type, |
45 | ! |
" directory option\n(e.g., options(lingmatch.", type, ".dir = ", |
46 | ! |
'"~/', if (type == "dict") "Dictionaries" else "Latent Semantic Space", |
47 | ! |
'")) or initialize it with lma_initdirs' |
48 | ! |
), call. = FALSE) |
49 |
} |
|
50 | 4x |
all_resources <- rownames(if (type == "dict") dict_info else lss_info) |
51 | ! |
if (length(resource) == 1 && resource == "all") resource <- all_resources |
52 | 4x |
if (length(resource) > 1) { |
53 | 1x |
return(lapply(structure(resource, names = resource), function(d) { |
54 | 2x |
tryCatch( |
55 | 2x |
download.resource( |
56 | 2x |
type = type, resource = d, include.terms = include.terms, decompress = decompress, |
57 | 2x |
check.md5 = check.md5, mode = mode, dir = dir |
58 |
), |
|
59 | 2x |
error = function(e) e$message |
60 |
) |
|
61 |
})) |
|
62 |
} |
|
63 | 3x |
dir <- normalizePath(dir, "/", FALSE) |
64 | 1x |
if (resource == "default") resource <- if (type == "dict") "lusi" else "100k_lsa" |
65 | 3x |
name <- grep(paste0("^", sub("\\..*$", "", resource)), all_resources, value = TRUE) |
66 | 3x |
if (!length(name)) { |
67 | 1x |
name <- grep( |
68 | 1x |
paste0("^", substr(resource, 1, 4)), all_resources, TRUE, |
69 | 1x |
value = TRUE |
70 |
) |
|
71 |
} |
|
72 | 3x |
if (!length(name)) { |
73 | ! |
stop( |
74 | ! |
type, " ", resource, " not recognized; see https://osf.io/", |
75 | ! |
if (type == "dict") "y6g5b" else "489he", "/wiki for available resources" |
76 |
) |
|
77 |
} else { |
|
78 | 3x |
name <- name[1] |
79 |
} |
|
80 | 3x |
urls <- list( |
81 | 3x |
info = function(id) paste0("https://api.osf.io/v2/files/", id), |
82 | 3x |
dl = function(id) paste0("https://osf.io/download/", id), |
83 | 3x |
versions = function(id) paste0("https://osf.io/", id, "/?show=revision") |
84 |
) |
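# e.g., urls$dl("xr7jv") builds "https://osf.io/download/xr7jv"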
|
85 | ! |
if (!dir.exists(dir)) dir.create(dir, recursive = TRUE) |
86 | 3x |
dl <- function(id, ext) { |
87 | 5x |
s <- urls$dl(id) |
88 | 5x |
o <- unique(normalizePath(paste0( |
89 | 5x |
dir, "/", name, c(ext, sub(".bz2", "", ext, fixed = TRUE)) |
90 | 5x |
), "/", FALSE)) |
91 | 5x |
if (any(file.exists(o))) { |
92 | ! |
if (overwrite) { |
93 | ! |
unlink(o) |
94 |
} else { |
|
95 | ! |
return(-1) |
96 |
} |
|
97 |
} |
|
98 | 5x |
status <- tryCatch(download.file(s, o[[1]], mode = mode), error = function(e) 1) |
99 | 5x |
if (!status && check.md5) { |
100 | 5x |
fi <- strsplit(readLines(urls$info(id), 1, TRUE, FALSE, "utf-8"), '[:,{}"]+')[[1]] |
101 | 5x |
ck <- md5sum(o[[1]]) |
102 | 5x |
if (fi[which(fi == "md5") + 1] != ck) { |
103 | ! |
warning(paste0( |
104 | ! |
"MD5 (", ck, ") does not seem to match the one on record;\n", |
105 | ! |
"double check and try manually downloading at ", urls$versions(id) |
106 |
)) |
|
107 |
} |
|
108 |
} |
|
109 | ! |
if (status) warning("failed to download file from ", s, call. = FALSE) |
110 | 5x |
status |
111 |
} |
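# dl() returns -1 if the target file already exists (and overwrite is FALSE),
# 0 on a successful download, and a non-zero status on failure; the callers
# below treat status < 1 as "file is in place".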
|
112 | 3x |
if (type == "lspace") { |
113 | 2x |
status <- if (include.terms) dl(lss_info[name, "osf_terms"], "_terms.txt") else 0 |
114 | 2x |
if (status < 1) status <- dl(lss_info[name, "osf_dat"], ".dat.bz2") |
115 | 2x |
if (status < 1 && decompress) { |
116 | 2x |
if (Sys.which("bunzip2") == "") { |
117 | ! |
warning("could not find path to bunzip2 command for decompression") |
118 |
} else { |
|
119 | 2x |
o <- normalizePath(paste0(dir, "/", name, ".dat.bz2"), "/", FALSE) |
120 | 2x |
if (file.exists(o)) { |
121 | 2x |
status <- tryCatch(system2("bunzip2", shQuote(path.expand(o))), error = function(e) 1) |
122 | 2x |
if (status) { |
123 | ! |
warning( |
124 | ! |
'failed to decompress; might try this from a system console:\n bunzip2 "', path.expand(o), '"' |
125 |
) |
|
126 |
} |
|
127 |
} |
|
128 |
} |
|
129 |
} |
|
130 | 2x |
paths <- normalizePath(paste0( |
131 | 2x |
dir, "/", name, c(".dat", if (!decompress) ".bz2", "_terms.txt") |
132 | 2x |
), "/", FALSE) |
133 |
} else { |
|
134 | 1x |
ext <- if (dict_info[name, "weighted"]) ".csv" else ".dic" |
135 | 1x |
status <- dl(dict_info[name, "osf"], ext) |
136 | 1x |
paths <- normalizePath(paste0(dir, "/", name, ext), "/", FALSE) |
137 |
} |
|
138 | 3x |
if (status < 1) { |
139 | 3x |
message( |
140 | 3x |
paste0(name, " ", type, " ", if (!status) "downloaded" else "exists", ":\n "), |
141 | 3x |
paste(paths, collapse = "\n ") |
142 |
) |
|
143 |
} |
|
144 | 3x |
invisible(paths) |
145 |
} |
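# download.resource() backs the exported download.dict and download.lspace
# helpers; a direct call might look like (path is illustrative):
#   download.resource("dict", "lusi", dir = "~/Dictionaries")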
1 |
#' Process Text |
|
2 |
#' |
|
3 |
#' A wrapper to other pre-processing functions, potentially from \code{\link{read.segments}}, to \code{\link{lma_dtm}} |
|
4 |
#' or \code{\link{lma_patcat}}, to \code{\link{lma_weight}}, then \code{\link{lma_termcat}} or \code{\link{lma_lspace}}, |
|
5 |
#' and optionally including \code{\link{lma_meta}} output. |
|
6 |
#' |
|
7 |
#' @param input A vector of text, or path to a text file or folder. |
|
8 |
#' @param ... arguments to be passed to \code{\link{lma_dtm}}, \code{\link{lma_patcat}}, \code{\link{lma_weight}}, |
|
9 |
#' \code{\link{lma_termcat}}, and/or \code{\link{lma_lspace}}. All arguments must be named. |
|
10 |
#' @param meta Logical; if \code{FALSE}, metastatistics are not included. Only applies when raw text is available. |
|
11 |
#' If included, meta categories are added as the last columns, with names starting with "meta_". |
|
12 |
#' @param coverage Logical; if \code{TRUE} and a dictionary is provided (\code{dict}), |
|
13 |
#' will calculate the coverage (number of unique term matches) of each dictionary category. |
|
14 |
#' @return A matrix with texts represented by rows, and features in columns, unless there are multiple rows per output |
|
15 |
#' (e.g., when a latent semantic space is applied without terms being mapped) in which case only the special output |
|
16 |
#' is returned (e.g., a matrix with terms as rows and latent dimensions in columns). |
|
17 |
#' @seealso If you just want to compare texts, see the \code{\link{lingmatch}} function. |
|
18 |
#' @examples |
|
19 |
#' # starting with some texts in a vector |
|
20 |
#' texts <- c( |
|
21 |
#' "Firstly, I would like to say, and with all due respect...", |
|
22 |
#' "Please, proceed. I hope you feel you can speak freely...", |
|
23 |
#' "Oh, of course, I just hope to be clear, and not cause offense...", |
|
24 |
#' "Oh, no, don't monitor yourself on my account..." |
|
25 |
#' ) |
|
26 |
#' |
|
27 |
#' # by default, term counts and metastatistics are returned |
|
28 |
#' lma_process(texts) |
|
29 |
#' |
|
30 |
#' # add dictionary and percent arguments for standard dictionary-based results |
|
31 |
#' lma_process(texts, dict = lma_dict(), percent = TRUE) |
|
32 |
#' |
|
33 |
#' # add space and weight arguments for standard word-centroid vectors |
|
34 |
#' lma_process(texts, space = lma_lspace(texts), weight = "tfidf") |
|
35 |
#' @export |
|
36 | ||
37 |
lma_process <- function(input = NULL, ..., meta = TRUE, coverage = FALSE) { |
|
38 | 23x |
inp <- as.list(substitute(...())) |
39 | 23x |
funs <- c("read.segments", "lma_dtm", "lma_weight", "lma_lspace", "lma_termcat", "lma_patcat") |
40 | 23x |
allargs <- NULL |
41 | 23x |
arg_matches <- list() |
42 | 23x |
for (f in funs) { |
43 | 138x |
a <- names(as.list(args(f))) |
44 | 138x |
a <- a[c(FALSE, a[-1] %in% names(inp))] |
45 | 138x |
allargs <- c(allargs, a) |
46 | 138x |
arg_matches[[f]] <- inp[a] |
47 |
} |
|
48 | 23x |
dupargs <- unique(allargs[duplicated(allargs)]) |
49 | 23x |
arg_checks <- vapply(arg_matches, function(l) sum(!names(l) %in% dupargs), 0) |
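# arg_matches now holds, for each candidate function, the ... arguments whose
# names appear in that function's formals; arg_checks counts matches unique to
# each function, which later decides routing (e.g., lma_patcat vs. lma_dtm).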
50 |
# identify input |
|
51 | 23x |
op <- NULL |
52 | 1x |
if (is.function(input)) stop("enter a character vector or matrix-like object as input") |
53 | 2x |
if (is.null(dim(input)) && is.list(input) && is.character(input[[1]])) input <- unlist(input, use.names = FALSE) |
54 | 22x |
if (is.character(input) || is.factor(input)) { |
55 | 12x |
ck_paths <- length(input) != 1 && all(file.exists(input)) |
56 | 12x |
op <- if (length(arg_matches$read.segments) || ck_paths) { |
57 | 1x |
an <- names(arg_matches$read.segments) |
58 | 1x |
if (!any(grepl("path|text", an))) arg_matches$read.segments$path <- input |
59 | 1x |
do.call(read.segments, lapply(arg_matches$read.segments, eval.parent, 2)) |
60 |
} else { |
|
61 | 11x |
data.frame( |
62 | 11x |
text = if (length(input) == 1 && file.exists(input)) readLines(input) else input, stringsAsFactors = FALSE |
63 |
) |
|
64 |
} |
|
65 |
} else { |
|
66 | 10x |
if (is.null(dim(input))) { |
67 | 3x |
if (is.null(names(input)) || (is.list(input) && !all(vapply(input, length, 0) == length(input[[1]])))) { |
68 | 1x |
stop("input is not of a recognized format -- should be text or a dtm-like object") |
69 |
} |
|
70 | 2x |
input <- if (is.list(input)) as.data.frame(input, stringsAsFactors = FALSE) else t(input) |
71 |
} |
|
72 | 9x |
op <- input |
73 |
} |
|
74 |
# process |
|
75 | 21x |
ck_text <- "text" %in% colnames(op) && is.character(op[, "text"]) |
76 | 21x |
ck_changed <- FALSE |
77 | 21x |
if (ck_text) { |
78 | 13x |
if (arg_checks[["lma_patcat"]]) { |
79 | 3x |
if (!"return.dtm" %in% names(arg_matches$lma_patcat) && (coverage || length(arg_matches$lma_weight))) { |
80 | 3x |
arg_matches$lma_patcat$return.dtm <- TRUE |
81 |
} |
|
82 | 3x |
arg_matches$lma_patcat$text <- op[, "text"] |
83 | 3x |
x <- do.call(lma_patcat, lapply(arg_matches$lma_patcat, eval.parent, 2)) |
84 | 3x |
ck_changed <- TRUE |
85 |
} else { |
|
86 | 10x |
arg_matches$lma_dtm$text <- op[, "text"] |
87 | 10x |
x <- do.call(lma_dtm, lapply(arg_matches$lma_dtm, eval.parent, 2)) |
88 | 10x |
ck_changed <- TRUE |
89 |
} |
|
90 |
} else { |
|
91 | 8x |
x <- op |
92 |
} |
|
93 | 21x |
if (arg_checks[["lma_weight"]]) { |
94 | 10x |
arg_matches$lma_weight$dtm <- x |
95 | 10x |
x <- do.call(lma_weight, lapply(arg_matches$lma_weight, eval.parent, 2)) |
96 | 10x |
attr(x, "categories") <- attr(arg_matches$lma_weight$dtm, "categories") |
97 | 10x |
ck_changed <- TRUE |
98 |
} |
|
99 | 21x |
if (!is.null(attr(x, "categories"))) { |
100 | 5x |
categories <- attr(x, "categories") |
101 | 5x |
xc <- as.data.frame( |
102 | 5x |
matrix(0, nrow(op), length(categories), dimnames = list(NULL, names(categories))), |
103 | 5x |
stringsAsFactors = FALSE |
104 |
) |
|
105 | 5x |
if (coverage) { |
106 | 1x |
cxc <- xc |
107 | 1x |
for (cat in names(categories)) { |
108 | 3x |
su <- x[, categories[[cat]], drop = FALSE] |
109 | 3x |
xc[, cat] <- rowSums(su, na.rm = TRUE) |
110 | 3x |
cxc[, cat] <- rowSums(su != 0, na.rm = TRUE) |
111 |
} |
|
112 | 1x |
colnames(cxc) <- paste0("coverage_", colnames(xc)) |
113 | 1x |
xc <- cbind(xc, cxc) |
114 |
} else { |
|
115 | 4x |
for (cat in names(categories)) { |
116 | 48x |
xc[, cat] <- rowSums(x[, categories[[cat]], drop = FALSE], na.rm = TRUE) |
117 |
} |
|
118 |
} |
|
119 | 5x |
x <- xc |
120 | 5x |
ck_changed <- TRUE |
121 | 16x |
} else if (arg_checks[["lma_termcat"]] || (length(arg_matches$lma_termcat) && |
122 | 16x |
any(names(arg_matches$lma_termcat) != "dir"))) { |
123 | 9x |
arg_matches$lma_termcat$coverage <- coverage |
124 | 9x |
arg_matches$lma_termcat$dtm <- x |
125 | 9x |
x <- do.call(lma_termcat, lapply(arg_matches$lma_termcat, eval.parent, 2)) |
126 | 9x |
ck_changed <- TRUE |
127 |
} |
|
128 | 21x |
if (arg_checks[["lma_lspace"]]) { |
129 | 2x |
nr <- NROW(x) |
130 | 2x |
arg_matches$lma_lspace$dtm <- x |
131 | 2x |
x <- do.call(lma_lspace, lapply(arg_matches$lma_lspace, eval.parent, 2)) |
132 | 2x |
if (nrow(x) != nr) { |
133 | 1x |
colnames(x) <- paste0("dim", seq_len(ncol(x))) |
134 | 1x |
return(x) |
135 |
} |
|
136 | 1x |
ck_changed <- TRUE |
137 |
} |
|
138 | 5x |
if (any(grepl("Matrix", class(x), fixed = TRUE))) x <- as.matrix(x) |
139 | 15x |
if (is.matrix(x)) x <- as.data.frame(x, stringsAsFactors = FALSE) |
140 | 20x |
op <- if (ck_text && ck_changed) cbind(op, if (is.null(dim(x))) t(x) else x) else x |
141 | 20x |
if (ck_text && meta) { |
142 | 10x |
opm <- lma_meta(op[, "text"]) |
143 | 10x |
if (arg_checks[["lma_weight"]] && |
144 | 10x |
(!"normalize" %in% names(arg_matches$lma_weight) || arg_matches$lma_weight$normalize)) { |
145 | 3x |
cols <- c(9, 14:23) |
146 | 3x |
opm_counts <- opm[, cols] |
147 | 3x |
su <- opm_counts != 0 |
148 | 3x |
adj <- if ("percent" %in% names(arg_matches$lma_weight) && arg_matches$lma_weight$percent) 100 else 1 |
149 | 3x |
opm_counts[su] <- opm_counts[su] / rep(opm$words, length(cols))[which(su)] * adj |
150 | 3x |
opm[, cols] <- opm_counts |
151 |
} |
|
152 | 10x |
colnames(opm) <- paste0("meta_", colnames(opm)) |
153 | 10x |
op <- cbind(op, opm) |
154 |
} |
|
155 | 20x |
op |
156 |
} |
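# As a rough guide to the dispatch above: a dict argument routes the dtm
# through lma_termcat (or lma_patcat), and a space argument through
# lma_lspace; e.g., lma_process(texts, dict = lma_dict(), coverage = TRUE)
# adds coverage_ columns alongside the category scores.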
1 |
#' Standardize a Latent Semantic Space |
|
2 |
#' |
|
3 |
#' Reformat a .rda file which has a matrix with terms as row names, or a plain-text embeddings file |
|
4 |
#' which has a term at the start of each line, and consistent delimiting characters. Plain-text files |
|
5 |
#' are processed line-by-line, so large spaces can be reformatted RAM-conservatively. |
|
6 |
#' |
|
7 |
#' @param infile Name of the .rda or plain-text file relative to \code{dir}, \cr |
|
8 |
#' e.g., "default.rda" or "glove/glove.6B.300d.txt". |
|
9 |
#' @param name Base name of the reformatted file and term file; e.g., "glove" would result in |
|
10 |
#' \code{glove.dat} and \code{glove_terms.txt} in \code{outdir}. |
|
11 |
#' @param sep Delimiting character between values in each line, e.g., \code{" "} or \code{"\\t"}. |
|
12 |
#' Only applies to plain-text files. |
|
13 |
#' @param digits Number of digits to round values to; default is 9. |
|
14 |
#' @param dir Path to folder containing \code{infile}s. \cr Default is \code{getOption('lingmatch.lspace.dir')}, |
|
15 |
#' which must be set in the current session. If this is not specified and \code{infile} is a full path, |
|
16 |
#' \code{dir} will be set to \code{infile}'s parent directory. |
|
17 |
#' @param outdir Path to folder in which to save standardized files; default is \code{dir}. |
|
18 |
#' @param remove A string with a regex pattern to be removed from term names \cr (i.e., \code{gsub(remove,} |
|
19 |
#' \code{"", term)}); default is \code{""}, which is ignored. |
|
20 |
#' @param term_check A string with a regex pattern by which to filter terms; i.e., only lines with fully |
|
21 |
#' matched terms are written to the reformatted file. The default attempts to retain only regular words, including |
|
22 |
#' those with dashes, forward slashes, and periods. Set to an empty string (\code{""}) to write all lines |
|
23 |
#' regardless of term. |
|
24 |
#' @param verbose Logical: if \code{TRUE}, prints the current line number and its term to the console every 1,000 lines. |
|
25 |
#' Only applies to plain-text files. |
|
26 |
#' @family Latent Semantic Space functions |
|
27 |
#' @return Path to the standardized [1] data file and [2] terms file if applicable. |
|
28 |
#' @examples |
|
29 |
#' \dontrun{ |
|
30 |
#' |
|
31 |
#' # from https://sites.google.com/site/fritzgntr/software-resources/semantic_spaces |
|
32 |
#' standardize.lspace("EN_100k_lsa.rda", "100k_lsa") |
|
33 |
#' |
|
34 |
#' # from https://fasttext.cc/docs/en/english-vectors.html |
|
35 |
#' standardize.lspace("crawl-300d-2M.vec", "facebook_crawl") |
|
36 |
#' |
|
37 |
#' # Standardized versions of these spaces can also be downloaded with download.lspace. |
|
38 |
#' } |
|
39 |
#' @export |
|
40 | ||
41 |
standardize.lspace <- function(infile, name, sep = " ", digits = 9, dir = getOption("lingmatch.lspace.dir"), |
|
42 |
outdir = dir, remove = "", term_check = "^[a-zA-Z]+$|^['a-zA-Z][a-zA-Z.'\\/-]*[a-zA-Z.]$", verbose = FALSE) { |
|
43 | 3x |
if (is.character(infile) && file.exists(infile) && missing(dir)) { |
44 | 1x |
dir <- dirname(normalizePath(infile, mustWork = FALSE)) |
45 | 1x |
if (missing(outdir)) outdir <- dir |
46 | 2x |
} else if (dir == "") { |
47 | ! |
if (outdir != "") { |
48 | ! |
dir <- outdir |
49 |
} else { |
|
50 | ! |
stop(paste( |
51 | ! |
"specify a directory (dir), or set the lspace directory option\n(e.g.,", |
52 | ! |
'options(lingmatch.lspace.dir = "~/Latent Semantic Spaces")) or call lma_initdirs()' |
53 | ! |
), call. = FALSE) |
54 |
} |
|
55 |
} |
|
56 | ! |
if (!is.character(term_check)) term_check <- "" |
57 | 3x |
if (is.character(infile)) { |
58 | 3x |
ip <- normalizePath(paste0(dir, "/", infile), "/", FALSE) |
59 | 1x |
if (!file.exists(ip)) ip <- infile |
60 |
} |
|
61 | 3x |
op <- normalizePath(paste0(outdir, "/", name), "/", FALSE) |
62 | 3x |
fs <- op |
63 | 3x |
if (!is.character(infile) || grepl("\\.rda$", infile)) { |
64 | 1x |
if (is.character(infile)) { |
65 | 1x |
f <- load(ip) |
66 | 1x |
o <- get(f) |
67 |
} else { |
|
68 | ! |
o <- infile |
69 |
} |
|
70 | 1x |
o <- round(o, digits) |
71 | 1x |
ot <- rownames(o) |
72 | ! |
if (remove != "") ot <- gsub(remove, "", ot) |
73 | 1x |
if (term_check != "") { |
74 | 1x |
su <- grepl(term_check, ot) |
75 | 1x |
o <- o[su, ] |
76 | 1x |
ot <- ot[su] |
77 |
} |
|
78 | 1x |
fs <- paste0(op, c(".dat", "_terms.txt")) |
79 | 1x |
writeLines(ot, fs[2]) |
80 | 1x |
write(formatC(t(o), digits, 0, "f"), fs[1], ncol(o)) |
81 | 1x |
if (is.character(infile)) rm(f, "o") |
82 |
} else { |
|
83 | ! |
if (!file.exists(ip)) stop("infile does not exist: ", ip) |
84 | 2x |
if (!grepl(term_check, scan(ip, "", 1, sep = sep, quiet = TRUE))) { |
85 | ! |
stop("infile does not appear to start with a term: ", ip) |
86 |
} |
|
87 | 2x |
fs <- paste0(op, ".dat") |
88 | 2x |
reformat_embedding(ip, op, sep, digits, remove, term_check, verbose) |
89 |
} |
|
90 | 3x |
message("created ", op, ".dat\nfrom ", ip) |
91 | 3x |
fs |
92 |
} |
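# infile can also be an in-memory matrix with terms as row names; a sketch
# (name and path are illustrative):
#   standardize.lspace(mat, "my_space", outdir = "~/Latent Semantic Spaces")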
1 |
#' Select Latent Semantic Spaces |
|
2 |
#' |
|
3 |
#' Retrieve information and links to latent semantic spaces |
|
4 |
#' (sets of word vectors/embeddings) available at \href{https://osf.io/489he}{osf.io/489he}, |
|
5 |
#' and optionally download their term mappings (\href{https://osf.io/xr7jv}{osf.io/xr7jv}). |
|
6 |
#' |
|
7 |
#' @param query A character used to select spaces, based on names or other features. |
|
8 |
#' If length is over 1, \code{get.map} is set to \code{TRUE}. Use \code{terms} alone to select |
|
9 |
#' spaces based on term coverage. |
|
10 |
#' @param dir Path to a directory containing \code{lma_term_map.rda} and downloaded spaces; \cr will look in |
|
11 |
#' \code{getOption('lingmatch.lspace.dir')} and \code{'~/Latent Semantic Spaces'} by default. |
|
12 |
#' @param terms A character vector of terms to search for in the downloaded term map, to calculate |
|
13 |
#' coverage of spaces, or select by coverage if \code{query} is not specified. |
|
14 |
#' @param get.map Logical; if \code{TRUE} and \code{lma_term_map.rda} is not found in |
|
15 |
#' \code{dir}, the term map (\href{https://osf.io/xr7jv}{lma_term_map.rda}) is |
|
16 |
#' downloaded and decompressed. |
|
17 |
#' @param check.md5 Logical; if \code{TRUE} (default), retrieves the MD5 checksum from OSF, |
|
18 |
#' and compares it with that calculated from the downloaded file to check its integrity. |
|
19 |
#' @param mode Passed to \code{\link{download.file}} when downloading the term map. |
|
20 |
#' @return A list with varying entries: |
|
21 |
#' \itemize{ |
|
22 |
#' \item \strong{\code{info}}: The version of \href{https://osf.io/9yzca}{osf.io/9yzca} stored internally; a |
|
23 |
#' \code{data.frame} with spaces as row names, and information about each space in columns: |
|
24 |
#' \itemize{ |
|
25 |
#' \item \strong{\code{terms}}: number of terms in the space |
|
26 |
#' \item \strong{\code{corpus}}: corpus(es) on which the space was trained |
|
27 |
#' \item \strong{\code{model}}: model from which the space was trained |
|
28 |
#' \item \strong{\code{dimensions}}: number of dimensions in the model (columns of the space) |
|
29 |
#' \item \strong{\code{model_info}}: some parameter details about the model |
|
30 |
#' \item \strong{\code{original_max}}: maximum value used to normalize the space; the original |
|
31 |
#' space would be \code{(vectors *} \code{original_max) /} \code{100} |
|
32 |
#' \item \strong{\code{osf_dat}}: OSF id for the \code{.dat} files; the URL would be |
|
33 |
#' https://osf.io/\code{osf_dat} |
|
34 |
#' \item \strong{\code{osf_terms}}: OSF id for the \code{_terms.txt} files; the URL would be |
|
35 |
#' https://osf.io/\code{osf_terms} |
|
36 |
#' \item \strong{\code{wiki}}: link to the wiki for the space |
|
37 |
#' \item \strong{\code{downloaded}}: path to the \code{.dat} file if downloaded, |
|
38 |
#' and \code{''} otherwise. |
|
39 |
#' } |
|
40 |
#' \item \strong{\code{selected}}: A subset of \code{info} selected by \code{query}. |
|
41 |
#' \item \strong{\code{term_map}}: If \code{get.map} is \code{TRUE} or \code{lma_term_map.rda} is found in |
|
42 |
#' \code{dir}, a copy of \href{https://osf.io/xr7jv}{osf.io/xr7jv}, which has space names as |
|
43 |
#' column names, terms as row names, and indices as values, with 0 indicating the term is not |
|
44 |
#' present in the associated space. |
|
45 |
#' } |
|
46 |
#' @family Latent Semantic Space functions |
|
47 |
#' @examples |
|
48 |
#' # just retrieve information about available spaces |
|
49 |
#' spaces <- select.lspace() |
|
50 |
#' spaces$info[1:10, c("terms", "dimensions", "original_max")] |
|
51 |
#' |
|
52 |
#' # retrieve all spaces that used word2vec |
|
53 |
#' w2v_spaces <- select.lspace("word2vec")$selected |
|
54 |
#' w2v_spaces[, c("terms", "dimensions", "original_max")] |
|
55 |
#' |
|
56 |
#' \dontrun{ |
|
57 |
#' |
|
58 |
#' # select spaces by terms |
|
59 |
#' select.lspace(terms = c( |
|
60 |
#' "part-time", "i/o", "'cause", "brexit", "debuffs" |
|
61 |
#' ))$selected[, c("terms", "coverage")] |
|
62 |
#' } |
|
63 |
#' @export |
|
64 | ||
65 |
select.lspace <- function(query = NULL, dir = getOption("lingmatch.lspace.dir"), terms = NULL, |
|
66 |
get.map = FALSE, check.md5 = TRUE, mode = "wb") { |
|
67 | 12x |
if (ckd <- dir == "") dir <- "~/Latent Semantic Spaces" |
68 | 13x |
if (!missing(query) && !is.character(query) && !is.null(colnames(query))) { |
69 | ! |
terms <- colnames(query) |
70 | ! |
query <- NULL |
71 |
} |
|
72 | 13x |
map_path <- normalizePath(paste0(dir, "/lma_term_map.rda"), "/", FALSE) |
73 | 5x |
if (missing(get.map) && (file.exists(map_path) || length(terms) > 1)) get.map <- TRUE |
74 | 13x |
if (!exists("lma_term_map")) lma_term_map <- NULL |
75 | 13x |
if (get.map && !ckd && !(file.exists(map_path) || !is.null(lma_term_map))) { |
76 | 1x |
fi <- tryCatch( |
77 | 1x |
strsplit(readLines("https://api.osf.io/v2/files/xr7jv", 1, TRUE, FALSE, "utf-8"), '[:,{}"]+')[[1]], |
78 | 1x |
error = function(e) NULL |
79 |
) |
|
80 | 1x |
if (!file.exists(map_path) || (!is.null(fi) && md5sum(map_path) != fi[which(fi == "md5") + 1])) { |
81 | 1x |
status <- tryCatch(download.file( |
82 | 1x |
"https://osf.io/download/xr7jv", map_path, |
83 | 1x |
mode = mode |
84 | 1x |
), error = function(e) 1) |
85 | 1x |
if (!status && check.md5 && !is.null(fi)) { |
86 | 1x |
ck <- md5sum(map_path) |
87 | 1x |
if (fi[which(fi == "md5") + 1] == ck) { |
88 | 1x |
load(map_path) |
89 | 1x |
save(lma_term_map, file = map_path, compress = FALSE) |
90 |
} else { |
|
91 | ! |
warning(paste0( |
92 | ! |
"The term map's MD5 (", ck, ") does not seem to match the one on record;\n", |
93 | ! |
"double check and try manually downloading at https://osf.io/xr7jv/?show=revision" |
94 |
)) |
|
95 |
} |
|
96 |
} |
|
97 |
} |
|
98 | 12x |
} else if (!file.exists(map_path) && !is.null(terms)) { |
99 | ! |
stop("The term map could not be found; specify dir or run lma_initdirs('~') to download it", call. = FALSE) |
100 |
} |
|
101 | 13x |
r <- list(info = lss_info, selected = lss_info[NULL, ]) |
102 | 13x |
r$info[, "wiki"] <- paste0("https://osf.io/489he/wiki/", rownames(lss_info)) |
103 | 13x |
r$info[, "downloaded"] <- normalizePath(paste0(dir, "/", rownames(r$info), ".dat"), "/", FALSE) |
104 | 13x |
r$info[!file.exists(r$info[, "downloaded"]), "downloaded"] <- "" |
105 | 13x |
if (get.map) { |
106 | 6x |
if (!is.null(lma_term_map)) { |
107 | 1x |
r$term_map <- lma_term_map |
108 | 5x |
} else if (file.exists(map_path) && is.null(lma_term_map)) { |
109 | 5x |
load(map_path) |
110 | 5x |
r$term_map <- lma_term_map |
111 | 5x |
rm(list = "lma_term_map") |
112 |
} |
|
113 |
} |
|
114 | 13x |
if (!is.null(terms)) { |
115 | 3x |
if (length(terms) > 1 && "term_map" %in% names(r)) { |
116 | 3x |
terms <- tolower(terms) |
117 | 3x |
overlap <- terms[terms %in% rownames(r$term_map)] |
118 | 3x |
if (length(overlap)) { |
119 | 3x |
r$info$coverage <- colSums(r$term_map[overlap, , drop = FALSE] != 0) / length(terms) |
120 | 3x |
r$selected <- r$info[order(r$info$coverage, decreasing = TRUE)[1:5], ] |
121 | 3x |
r$space_terms <- overlap |
122 |
} else { |
|
123 | ! |
warning("no terms were found") |
124 |
} |
|
125 |
} |
|
126 |
} |
|
127 | 13x |
if (!is.null(query)) { |
128 | 8x |
query <- paste0(query, collapse = "|") |
129 | 8x |
if (!length(sel <- grep(query, rownames(lss_info), TRUE))) { |
130 | 3x |
collapsed <- vapply( |
131 | 3x |
seq_len(nrow(lss_info)), |
132 | 3x |
function(r) paste(c(rownames(lss_info)[r], lss_info[r, ]), collapse = " "), "" |
133 |
) |
|
134 | 3x |
if (!length(sel <- grep(query, collapsed, TRUE))) { |
135 | 2x |
sel <- grep(paste(strsplit(query, "[[:space:],|]+")[[1]], collapse = "|"), collapsed, TRUE) |
136 |
} |
|
137 |
} |
|
138 | 8x |
if (length(sel)) r$selected <- r$info[sel, ] |
139 |
} |
|
140 | 13x |
r |
141 |
} |
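# Once downloaded, the term map can also be inspected directly; a sketch
# (assuming the 100k_lsa space is among the map's columns):
#   res <- select.lspace(get.map = TRUE)
#   head(res$term_map[, "100k_lsa"]) # 0 means the term is absent from the space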
1 |
#' English Function Word Category and Special Character Lists |
|
2 |
#' |
|
3 |
#' Returns a list of function words based on the Linguistic Inquiry and Word Count 2015 dictionary |
|
4 |
#' (in terms of category names -- words were selected independently), or a list of special characters and patterns. |
|
5 |
#' @param ... Numbers or letters corresponding to category names: ppron, ipron, article, |
|
6 |
#' adverb, conj, prep, auxverb, negate, quant, interrog, number, interjection, or special. |
|
7 |
#' @param as.regex Logical: if \code{FALSE}, lists are returned without regular expression syntax. |
|
8 |
#' @param as.function Logical or a function: if specified and \code{as.regex} is \code{TRUE}, the selected dictionary |
|
9 |
#' will be collapsed to a regex string (terms separated by \code{|}), and a function for matching characters to that |
|
10 |
#' string will be returned. The regex string is passed to the matching function (\code{\link{grepl}} by default) |
|
11 |
#' as a 'pattern' argument, with the first argument of the returned function being passed as an 'x' argument. |
|
12 |
#' See examples. |
|
13 |
#' @note |
|
14 |
#' The \code{special} category is not returned unless specifically requested. It is a list of regular expression |
|
15 |
#' strings attempting to capture special things like ellipses and emojis, or sets of special characters (those outside |
|
16 |
#' of the Basic Latin range; \code{[^\\u0020-\\u007F]}), which can be used for character conversions. |
|
17 |
#' If \code{special} is part of the returned list, \code{as.regex} is set to \code{TRUE}. |
|
18 |
#' |
|
19 |
#' The \code{special} list is always used by both \code{\link{lma_dtm}} and \code{\link{lma_termcat}}. When creating a |
|
20 |
#' dtm, \code{special} is used to clean the original input (so that, by default, the punctuation involved in ellipses |
|
21 |
#' and emojis is treated as distinct -- as ellipses and emojis rather than as periods, parens, colons, and such). |
|
22 |
#' When categorizing a dtm, the input dictionary is passed through the special lists to be sure the terms in the dtm match up |
|
23 |
#' with the dictionary (so, for example, ": (" would be replaced with "repfrown" in both the text and dictionary). |
|
24 |
#' @seealso To score texts with these categories, use \code{\link{lma_termcat}}. |
|
25 |
#' @return A list with a vector of terms for each category, or (when \code{as.function = TRUE}) a function which |
|
26 |
#' accepts an initial "terms" argument (a character vector), and any additional arguments determined by function |
|
27 |
#' entered as \code{as.function} (\code{\link{grepl}} by default). |
|
28 |
#' @examples |
|
29 |
#' # return the full dictionary (excluding special) |
|
30 |
#' lma_dict() |
|
31 |
#' |
|
32 |
#' # return the standard 7 category lsm categories |
|
33 |
#' lma_dict(1:7) |
|
34 |
#' |
|
35 |
#' # return just a few categories without regular expression |
|
36 |
#' lma_dict(neg, ppron, aux, as.regex = FALSE) |
|
37 |
#' |
|
38 |
#' # return special specifically |
|
39 |
#' lma_dict(special) |
|
40 |
#' |
|
41 |
#' # returning a function |
|
42 |
#' is.ppron <- lma_dict(ppron, as.function = TRUE) |
|
43 |
#' is.ppron(c("i", "am", "you", "were")) |
|
44 |
#' |
|
45 |
#' in.lsmcat <- lma_dict(1:7, as.function = TRUE) |
|
46 |
#' in.lsmcat(c("a", "frog", "for", "me")) |
|
47 |
#' |
|
48 |
#' ## use as a stopword filter |
|
49 |
#' is.stopword <- lma_dict(as.function = TRUE) |
|
50 |
#' dtm <- lma_dtm("Most of these words might not be all that relevant.") |
|
51 |
#' dtm[, !is.stopword(colnames(dtm))] |
|
52 |
#' |
|
53 |
#' ## use to replace special characters |
|
54 |
#' clean <- lma_dict(special, as.function = gsub) |
|
55 |
#' clean(c( |
|
56 |
#' "\u201Ccurly quotes\u201D", "na\u00EFve", "typographer\u2019s apostrophe", |
|
57 |
#' "en\u2013dash", "em\u2014dash" |
|
58 |
#' )) |
|
59 |
#' @export |
|
60 | ||
61 |
lma_dict <- function(..., as.regex = TRUE, as.function = FALSE) { |
|
62 | 90x |
cats <- as.character(substitute(...())) |
63 | 90x |
dict <- list( |
64 | 90x |
ppron = c( |
65 | 90x |
"^dae$", "^dem$", "^eir$", "^eirself$", "^em$", "^he$", "^he'", "^her$", "^hers$", "^herself$", "^hes$", |
66 | 90x |
"^him$", "^himself$", "^hir$", "^hirs$", "^hirself$", "^his$", "^hisself$", "^i$", "^i'", "^id$", "^idc$", |
67 | 90x |
"^idgaf$", "^idk$", "^idontknow$", "^idve$", "^iirc$", "^iknow$", "^ikr$", "^ill$", "^ily$", "^im$", "^ima$", |
68 | 90x |
"^imean$", "^imma$", "^ive$", "^lets$", "^let's$", "^me$", "^methinks$", "^mine$", "^my$", "^myself$", "^omfg$", |
69 | 90x |
"^omg$", "^oneself$", "^our$", "^ours", "^she$", "^she'", "^shes$", "^thee$", "^their$", "^their'", "^theirs", |
70 | 90x |
"^them$", "^thems", "^they$", "^they'", "^theyd$", "^theyll$", "^theyve$", "^thine$", "^thou$", "^thoust$", |
71 | 90x |
"^thy$", "^thyself$", "^u$", "^u'", "^ud$", "^ull$", "^ur$", "^ure$", "^us$", "^we$", "^we'", "^weve$", "^y'", |
72 | 90x |
"^ya'", "^yall", "^yins$", "^yinz$", "^you$", "^you'", "^youd$", "^youll$", "^your$", "^youre$", "^yours$", |
73 | 90x |
"^yourself$", "^yourselves$", "^youve$", "^zer$", "^zir$", "^zirs$", "^zirself$", "^zis$" |
74 |
), |
|
75 | 90x |
ipron = c( |
76 | 90x |
"^another$", "^anybo", "^anyone", "^anything", "^dat$", "^de+z$", "^dis$", "^everyb", "^everyone", |
77 | 90x |
"^everything", "^few$", "^it$", "^it'$", "^it'", "^itd$", "^itll$", "^its$", "^itself$", "^many$", "^nobod", |
78 | 90x |
"^nothing$", "^other$", "^others$", "^same$", "^somebo", "^somebody'", "^someone", "^something", "^stuff$", |
79 | 90x |
"^that$", "^that'", "^thatd$", "^thatll$", "^thats$", "^these$", "^these'", "^thesed$", "^thesell$", "^thesere$", |
80 | 90x |
"^thing", "^this$", "^this'", "^thisd$", "^thisll$", "^those$", "^those'", "^thosed$", "^thosell$", "^thosere$", |
81 | 90x |
"^what$", "^what'", "^whatd$", "^whatever$", "^whatll$", "^whats$", "^which", "^who$", "^who'", "^whod$", |
82 | 90x |
"^whoever$", "^wholl$", "^whom$", "^whomever$", "^whos$", "^whose$", "^whosever$", "^whosoever$" |
83 |
), |
|
84 | 90x |
article = c("^a$", "^an$", "^da$", "^teh$", "^the$"), |
85 | 90x |
adverb = c( |
86 | 90x |
"^absolutely$", "^actively$", "^actually$", "^afk$", "^again$", "^ago$", "^ahead$", "^almost$", |
87 | 90x |
"^already$", "^altogether$", "^always$", "^angrily$", "^anxiously$", "^any$", "^anymore$", "^anyway$", |
88 | 90x |
"^anywhere$", "^apparently$", "^automatically$", "^away$", "^awhile$", "^back$", "^badly$", "^barely$", |
89 | 90x |
"^basically$", "^below$", "^brietermsy$", "^carefully$", "^causiously$", "^certainly$", "^clearly$", "^closely$", |
90 | 90x |
"^coldly$", "^commonly$", "^completely$", "^constantly$", "^continually$", "^correctly$", "^coz$", "^currently$", |
91 | 90x |
"^daily$", "^deeply$", "^definitely$", "^definitly$", "^deliberately$", "^desperately$", "^differently$", |
92 | 90x |
"^directly$", "^early$", "^easily$", "^effectively$", "^elsewhere$", "^enough$", "^entirely$", "^equally$", |
93 | 90x |
"^especially$", "^essentially$", "^etc$", "^even$", "^eventually$", "^ever$", "^every$", "^everyday$", |
94 | 90x |
"^everywhere", "^exactly$", "^exclusively$", "^extremely$", "^fairly$", "^far$", "^finally$", "^fortunately$", |
95 | 90x |
"^frequently$", "^fully$", "^further$", "^generally$", "^gently$", "^genuinely$", "^good$", "^greatly$", |
96 | 90x |
"^hardly$", "^heavily$", "^hence$", "^henceforth$", "^hereafter$", "^herein$", "^heretofore$", "^hesitantly$", |
97 | 90x |
"^highly$", "^hither$", "^hopefully$", "^hotly$", "^however$", "^immediately$", "^importantly$", "^increasingly$", |
98 | 90x |
"^incredibly$", "^indeed$", "^initially$", "^instead$", "^intensely$", "^jus$", "^just$", "^largely$", "^lately$", |
99 | 90x |
"^least$", "^legitimately$", "^less$", "^lightly$", "^likely$", "^literally$", "^loudly$", "^luckily$", |
100 | 90x |
"^mainly$", "^maybe$", "^meanwhile$", "^merely$", "^more$", "^moreover$", "^most$", "^mostly$", "^much$", |
101 | 90x |
"^namely$", "^naturally$", "^nearly$", "^necessarily$", "^nervously$", "^never$", "^nevertheless$", "^no$", |
102 | 90x |
"^nonetheless$", "^normally$", "^not$", "^notwithstanding$", "^obviously$", "^occasionally$", "^often$", "^once$", |
103 | 90x |
"^only$", "^originally$", "^otherwise$", "^overall$", "^particularly$", "^passionately$", "^perfectly$", |
104 | 90x |
"^perhaps$", "^personally$", "^physically$", "^please$", "^possibly$", "^potentially$", "^practically$", |
105 | 90x |
"^presently$", "^previously$", "^primarily$", "^probability$", "^probably$", "^profoundly$", "^prolly$", |
106 | 90x |
"^properly$", "^quickly$", "^quietly$", "^quite$", "^randomly$", "^rarely$", "^rather$", "^readily$", "^really$", |
107 | 90x |
"^recently$", "^regularly$", "^relatively$", "^respectively$", "^right$", "^roughly$", "^sadly$", "^seldomly$", |
108 | 90x |
"^seriously$", "^shortly$", "^significantly$", "^similarly$", "^simply$", "^slightly$", "^slowly$", "^so$", |
109 | 90x |
"^some$", "^somehow$", "^sometimes$", "^somewhat$", "^somewhere$", "^soon$", "^specifically$", "^still$", |
110 | 90x |
"^strongly$", "^subsequently$", "^successfully$", "^such$", "^suddenly$", "^supposedly$", "^surely$", |
111 | 90x |
"^surprisingly$", "^technically$", "^terribly$", "^thence$", "^thereafter$", "^therefor$", "^therefore$", |
112 | 90x |
"^thither$", "^thoroughly$", "^thus$", "^thusfar$", "^thusly$", "^together$", "^too$", "^totally$", "^truly$", |
113 | 90x |
"^typically$", "^ultimately$", "^uncommonly$", "^unfortunately$", "^unfortunatly$", "^usually$", "^vastly$", |
114 | 90x |
"^very$", "^virtually$", "^well$", "^whence$", "^where", "^wherefor", "^whither$", "^wholly$", "^why$", "^why'", |
115 | 90x |
"^whyd$", "^whys$", "^widely$", "^wither$", "^yet$" |
116 |
), |
|
117 | 90x |
conj = c( |
118 | 90x |
"^also$", "^altho$", "^although$", "^and$", "^b/c$", "^bc$", "^because$", "^besides$", "^both$", "^but$", |
119 | 90x |
"^'cause$", "^cos$", "^cuz$", "^either$", "^else$", "^except$", "^for$", "^how$", "^how'", "^howd$", "^howll$", |
120 | 90x |
"^hows$", "^if$", "^neither$", "^nor$", "^or$", "^than$", "^tho$", "^though$", "^unless$", "^unlike$", "^versus$", |
121 | 90x |
"^vs$", "^when$", "^when'", "^whenever$", "^whereas$", "^whether$", "^while$", "^whilst$" |
122 |
), |
|
123 | 90x |
prep = c( |
124 | 90x |
"^about$", "^above$", "^abt$", "^across$", "^acrost$", "^afk$", "^after$", "^against$", "^along$", "^amid", |
125 | 90x |
"^among", "^around$", "^as$", "^at$", "^atop$", "^before$", "^behind$", "^beneath$", "^beside$", "^betwe", |
126 | 90x |
"^beyond$", "^by$", "^despite$", "^down$", "^during$", "^excluding$", "^from$", "^here$", "^here'", "^heres$", |
127 | 90x |
"^in$", "^including$", "^inside$", "^into$", "^minus$", "^near$", "^now$", "^of$", "^off$", "^on$", "^onto$", |
128 | 90x |
"^out$", "^outside$", "^over$", "^plus$", "^regarding$", "^sans$", "^since$", "^then$", "^there$", "^there'", |
129 | 90x |
"^thered$", "^therell$", "^theres$", "^through$", "^throughout$", "^thru$", "^til$", "^till$", "^to$", "^toward", |
130 | 90x |
"^under$", "^underneath$", "^until$", "^untill$", "^unto$", "^up$", "^upon$", "^via$", "^with$", "^within$", |
131 | 90x |
"^without$", "^worth$" |
132 |
), |
|
133 | 90x |
auxverb = c( |
134 | 90x |
"^am$", "^are$", "^arent$", "^aren't$", "^be$", "^been$", "^bein$", "^being$", "^brb$", "^can$", |
135 | 90x |
"^could$", "^could'", "^couldnt$", "^couldn't$", "^couldve$", "^did$", "^didnt$", "^didn't$", "^do$", "^does$", |
136 | 90x |
"^doesnt$", "^doesn't$", "^doing$", "^dont$", "^don't$", "^had$", "^hadnt$", "^hadn't$", "^has$", "^hasnt$", |
137 | 90x |
"^hasn't$", "^have$", "^havent$", "^haven't$", "^having$", "^is$", "^isnt$", "^isn't$", "^may$", "^might$", |
138 | 90x |
"^might'", "^mightnt$", "^mightn't$", "^mightve$", "^must$", "^mustnt$", "^mustn't$", "^mustve$", "^ought", |
139 | 90x |
"^shant$", "^shan't$", "^sha'nt$", "^shall$", "^should$", "^shouldnt$", "^shouldn't$", "^shouldve$", "^was$", |
140 | 90x |
"^wasnt$", "^wasn't$", "^were$", "^werent$", "^weren't$", "^will$", "^would$", "^would'", "^wouldnt", "^wouldn't", |
141 | 90x |
"^wouldve$" |
142 |
), |
|
143 | 90x |
negate = c( |
144 | 90x |
"^ain't$", "^aint$", "^aren't$", "^arent$", "^can't$", "^cannot$", "^cant$", "^couldn't$", "^couldnt$", |
145 | 90x |
"^didn't$", "^didnt$", "^doesn't$", "^doesnt$", "^don't$", "^dont$", "^hadn't$", "^hadnt$", "^hasn't$", "^hasnt$", |
146 | 90x |
"^haven't$", "^havent$", "^idk$", "^isn't$", "^isnt$", "^must'nt$", "^mustn't$", "^mustnt$", "^nah", "^need'nt$", |
147 | 90x |
"^needn't$", "^neednt$", "^negat", "^neither$", "^never$", "^no$", "^nobod", "^noes$", "^none$", "^nope$", |
148 | 90x |
"^nor$", "^not$", "^nothing$", "^nowhere$", "^np$", "^ought'nt$", "^oughtn't$", "^oughtnt$", "^shant$", |
149 | 90x |
"^shan't$", "^sha'nt$", "^should'nt$", "^shouldn't$", "^shouldnt$", "^uh-uh$", "^wasn't$", "^wasnt$", "^weren't$", |
150 | 90x |
"^werent$", "^without$", "^won't$", "^wont$", "^wouldn't$", "^wouldnt$" |
151 |
), |
|
152 | 90x |
quant = c( |
153 | 90x |
"^add$", "^added$", "^adding$", "^adds$", "^all$", "^allot$", "^alot$", "^amount$", "^amounts$", |
154 | 90x |
"^another$", "^any$", "^approximat", "^average$", "^bit$", "^bits$", "^both$", "^bunch$", "^chapter$", "^couple$", |
155 | 90x |
"^doubl", "^each$", "^either$", "^entire", "^equal", "^every$", "^extra$", "^few$", "^fewer$", "^fewest$", |
156 | 90x |
"^group", "^inequal", "^least$", "^less$", "^lot$", "^lotof$", "^lots$", "^lotsa$", "^lotta$", "^majority$", |
157 | 90x |
"^many$", "^mo$", "^mo'", "^more$", "^most$", "^much$", "^mucho$", "^multiple$", "^nada$", "^none$", "^part$", |
158 | 90x |
"^partly$", "^percent", "^piece$", "^pieces$", "^plenty$", "^remaining$", "^sampl", "^scarce$", "^scarcer$", |
159 | 90x |
"^scarcest$", "^section$", "^segment", "^series$", "^several", "^single$", "^singles$", "^singly$", "^some$", |
160 | 90x |
"^somewhat$", "^ton$", "^tons$", "^total$", "^triple", "^tripling$", "^variety$", "^various$", "^whole$" |
161 |
), |
|
162 | 90x |
interrog = c( |
163 | 90x |
"^how$", "^how'd$", "^how're$", "^how's$", "^howd$", "^howre$", "^hows$", "^wat$", "^wattt", "^what$", |
164 | 90x |
"^what'd$", "^what'll$", "^what're$", "^what's$", "^whatd$", "^whatever$", "^whatll$", "^whatre$", "^whatt", |
165 | 90x |
"^when$", "^when'", "^whence$", "^whenever$", "^where$", "^where'd$", "^where's$", "^wherefore$", "^wherever$", |
166 | 90x |
"^whether$", "^which$", "^whichever$", "^whither$", "^who$", "^who'd$", "^who'll$", "^who's$", "^whoever$", |
167 | 90x |
"^wholl$", "^whom$", "^whomever$", "^whos$", "^whose$", "^whosever$", "^whoso", "^why$", "^why'", "^whyever$", |
168 | 90x |
"^wut$" |
169 |
), |
|
170 | 90x |
number = c( |
171 | 90x |
"^billion", "^doubl", "^dozen", "^eight", "^eleven$", "^fift", "^first$", "^firstly$", "^firsts$", |
172 | 90x |
"^five$", "^four", "^half$", "^hundred", "^infinit", "^million", "^nine", "^once$", "^one$", "^quarter", |
173 | 90x |
"^second$", "^seven", "^single$", "^six", "^ten$", "^tenth$", "^third$", "^thirt", "^thousand", "^three$", |
174 | 90x |
"^trillion", "^twel", "^twent", "^twice$", "^two$", "^zero$", "^zillion" |
175 |
), |
|
176 | 90x |
interjection = c( |
177 | 90x |
"^a+h+$", "^a+w+$", "^allas$", "^alright", "^anyhoo$", "^anyway[ysz]", "^bl[eh]+$", "^g+[eah]+$", |
178 | 90x |
"^h[ah]+$", "^h[hu]+$", "^h[mh]+$", "^l[ol]+$", "^m[hm]+$", "^meh$", "^o+h+$", "^o+k+$", "^okie", "^oo+f+$", |
179 | 90x |
"^soo+$", "^u[uh]+$", "^u+g+h+$", "^w[ow]+$", "^wee+ll+$", "^y[aes]+$", "^ya+h+$", "^yeah$", "^yus+$" |
180 |
), |
|
181 | 90x |
special = list( |
182 | 90x |
ELLIPSIS = "\\.{3, }|\\. +\\. +[. ]+", |
183 | 90x |
SMILE = "\\s(?:[[{(<qd]+[\\s<-]*[;:8=]|[;:8=][\\s>-]*[]})>Dpb]+|[uUnwWmM^=+-]_[uUnwWmM^=+-])(?=\\s)", |
184 | 90x |
FROWN = "\\s(?:[]D)}>]+[\\s.,<-]*[;:8=]|[;:8=][\\s.,>-]*[[{(<]+|[Tt:;]_[Tt;:]|[uUtT;:][mMn][uUtT;:])(?=\\s)", |
185 | 90x |
LIKE = c( |
186 | 90x |
"(?<=could not) like\\b", "(?<=did not) like\\b", "(?<=did) like\\b", "(?<=didn't) like\\b", |
187 | 90x |
"(?<=do not) like\\b", "(?<=do) like\\b", "(?<=does not) like\\b", "(?<=does) like\\b", "(?<=doesn't) like\\b", |
188 | 90x |
"(?<=don't) like\\b", "(?<=i) like\\b", "(?<=should not) like\\b", "(?<=they) like\\b", "(?<=we) like\\b", |
189 | 90x |
"(?<=will not) like\\b", "(?<=will) like\\b", "(?<=won't) like\\b", "(?<=would not) like\\b", |
190 | 90x |
"(?<=you) like\\b" |
191 |
), |
|
192 | 90x |
CHARACTERS = c( |
193 | 90x |
` ` = "\\s", |
194 | 90x |
`'` = paste0( |
195 | 90x |
"[\u00B4\u2018\u2019\u201A\u201B\u2032\u2035\u02B9\u02BB\u02BE\u02BF\u02C8\u02CA\u02CB\u02F4", |
196 | 90x |
"\u0300\u0301\u030D\u0312\u0313\u0314\u0315\u031B\u0321\u0322\u0326\u0328\u0329\u0340\u0341\u0343\u0351", |
197 | 90x |
"\u0357]" |
198 |
), |
|
199 | 90x |
`"` = "[\u201C\u201D\u201E\u201F\u2033\u2034\u2036\u2037\u2057\u02BA\u02DD\u02EE\u02F5\u02F6\u030B\u030F]", |
200 | 90x |
`...` = "\u2026", |
201 | 90x |
`-` = "[\u05BE\u1806\u2010\u2011\u2013\uFE58\uFE63\uFF0D]", |
202 | 90x |
` - ` = "[\u2012\u2014\u2015\u2E3A\u2E3B]|--+", |
203 | 90x |
a = paste0( |
204 | 90x |
"[\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5\u0100\u0101\u0102", |
205 | 90x |
"\u0103\u0104\u105\u0200\u0201\u0202\u0203\u0226\u0227\u0245\u0250\u0251\u0252\u0255\u0363\u0386\u0391", |
206 | 90x |
"\u0410\u0430]" |
207 |
), |
|
208 | 90x |
ae = "[\u00C6\u00E6\u0152\u0153\u0276]", |
209 | 90x |
b = paste0( |
210 | 90x |
"[\u00DF\u0180\u0181\u0182\u0183\u0184\u0185\u0186\u0187\u0188\u0189\u018A\u018B\u018C\u0243", |
211 | 90x |
"\u0253\u0299\u0411\u0412\u0431\u0432\u0462\u0463\u0494\u0495\u212C]" |
212 |
), |
|
213 | 90x |
c = paste0( |
214 | 90x |
"[\u00C7\u00E7\u0106\u0107\u0108\u0109\u0186\u0187\u0188\u0254\u0297\u0368\u0421\u0441\u2102", |
215 | 90x |
"\u2103]" |
216 |
), |
|
217 | 90x |
d = paste0( |
218 | 90x |
"[\u00D0\u00DE\u00FE\u010D\u010E\u010F\u0110\u0111\u0189\u0221\u0256\u0256\u0257\u0369\u0392", |
219 | 90x |
"\u0434\u0500\u2145\u2146]" |
220 |
), |
|
221 | 90x |
e = paste0( |
222 | 90x |
"[\u00C8\u00C9\u00CA\u00CB\u00E8\u00E9\u00EA\u00EB\u0112\u0113\u0114\u0115\u0116\u0117\u0118", |
223 | 90x |
"\u0119\u011A\u011B\u018E\u018F\u0190\u0204\u0205\u0206\u0207\u0228\u0229\u0246\u0247\u0258\u0259\u0364", |
224 | 90x |
"\u0388\u0395\u0400\u0401\u0404\u0415\u0417\u0435\u0437\u0450\u0451\u0454\u0498\u0499\u2107\u2108\u2128", |
225 | 90x |
"\u212E\u212F\u2130\u2147]" |
226 |
), |
|
227 | 90x |
f = "[\u0191\u0192\u0492\u0493\u2109\u2231\u2132\u214E]", |
228 | 90x |
g = "[\u011C\u011D\u011E\u011F\u0120\u0121\u0122\u0123\u0193\u0222\u0260\u0261\u0262\u210A\u2141]", |
229 | 90x |
h = "[\u0124\u0125\u0127\u0195\u0266\u0267\u0389\u0397\u0452\u210B\u210C\u210D\u210E\u210F]", |
230 | 90x |
i = paste0( |
231 | 90x |
"[\u00CC\u00CD\u00CE\u00CF\u00EC\u00ED\u00EE\u00EF\u0128\u0129\u012A\u012B\u012C\u012D\u012E\u012F", |
232 | 90x |
"\u0130\u0131\u0197\u019A\u0208\u0209\u0365\u0390\u0399\u0406\u0407\u0456\u0457]" |
233 |
), |
|
234 | 90x |
j = "[\u0135\u0236\u0237\u0248\u0249\u0408\u0458\u2129\u2139\u2149]", |
235 | 90x |
k = "[\u0137\u0138\u0198\u0199\u212A]", |
236 | 90x |
l = "[\u0139\u013A\u013B\u013C\u013D\u013E\u013F\u0140\u0141\u0142\u0234]", |
237 | 90x |
m = "[\u0271\u0460\u2133]", |
238 | 90x |
n = paste0( |
239 | 90x |
"[\u00D1\u00F1\u0143\u0144\u0145\u0146\u0147\u0148\u0149\u014A\u014B\u0220\u0235\u0272\u0273", |
240 | 90x |
"\u0274\u0376\u0377\u0418\u0419\u0438\u0439\u2115\u2135]" |
241 |
), |
|
242 | 90x |
h = "\u0149", |
243 | 90x |
o = paste0( |
244 | 90x |
"[\u00D2\u00D3\u00D4\u00D5\u00D6\u00D8\u00F0\u00F2\u00F3\u00F4\u00F5\u00F6\u00F8\u014C\u014D", |
245 | 90x |
"\u014E\u014F\u0150\u0151\u0150\u0151\u0230\u0231\u0275\u0298\u0366\u0398\u0424\u0444\u0472\u0473\u2134]" |
246 |
), |
|
247 | 90x |
p = "[\u0420\u0440\u2117\u2118\u2119]", |
248 | 90x |
q = "[\u018D\u211A\u213A]", |
249 | 90x |
r = paste0( |
250 | 90x |
"[\u0154\u0155\u0156\u0157\u0158\u0159\u0211\u0212\u0213\u0279\u0280\u0281\u0433\u0453\u0490", |
251 | 90x |
"\u0491\u211B\u211C\u211D\u211F\u213E]" |
252 |
), |
|
253 | 90x |
s = "[\u015A\u015C\u015D\u015E\u015F\u0160\u0161\u0160\u0161\u0218\u0219\u0405\u0455]", |
254 | 90x |
t = "[\u0162\u0163\u0164\u0165\u0166\u0167\u0371\u0373\u0422\u0442]", |
255 | 90x |
u = paste0( |
256 | 90x |
"[\u00D9\u00DA\u00DB\u00DC\u00F9\u00FA\u00FB\u00FC\u00FC\u0168\u0169\u016A\u016B\u016C\u016D", |
257 | 90x |
"\u016E\u016F\u0170\u0171\u0172\ |