1 |
#' Document-Term Matrix Categorization |
|
2 |
#' |
|
3 |
#' Reduces the dimensions of a document-term matrix by dictionary-based categorization. |
|
4 |
#' @param dtm A matrix with terms as column names. |
|
5 |
#' @param dict The name of a provided dictionary |
|
6 |
#' (\href{https://osf.io/y6g5b/wiki/home}{osf.io/y6g5b/wiki}) or of a file found in |
|
7 |
#' \code{dir}, or a \code{list} object with named character vectors as word lists, |
|
8 |
#' or the path to a file to be read in by \code{\link{read.dic}}. |
|
9 |
#' @param term.weights A \code{list} object with named numeric vectors lining up with the character |
|
10 |
#' vectors in \code{dict}, used to weight the terms in each \code{dict} vector. If a category in |
|
11 |
#' \code{dict} is not specified in \code{term.weights}, or the \code{dict} and \code{term.weights} |
|
12 |
#' vectors aren't the same length, the weights for that category will be 1. |
|
13 |
#' @param bias A list or named vector specifying a constant to add to the named category. If a term |
|
14 |
#' matching \code{bias.name} is included in a category, it's associated \code{weight} will be used |
|
15 |
#' as the \code{bias} for that category. |
|
16 |
#' @param bias.name A character specifying a term to be used as a category bias; default is |
|
17 |
#' \code{'_intercept'}. |
|
18 |
#' @param escape Logical indicating whether the terms in \code{dict} should not be treated as plain |
|
19 |
#' text (including asterisk wild cards). If \code{TRUE}, regular expression related characters are |
|
20 |
#' escaped. Set to \code{TRUE} if you get PCRE compilation errors. |
|
21 |
#' @param partial Logical; if \code{TRUE} terms are partially matched (not padded by ^ and $). |
|
22 |
#' @param glob Logical; if \code{TRUE} (default), will convert initial and terminal asterisks to |
|
23 |
#' partial matches. |
|
24 |
#' @param to.lower Logical; if \code{TRUE} will lowercase dictionary terms. Otherwise, dictionary |
|
25 |
#' terms will be converted to match the terms if they are single-cased. Set to \code{FALSE} to |
|
26 |
#' always keep dictionary terms as entered. |
|
27 |
#' @param term.filter A regular expression string used to format the text of each term (passed to |
|
28 |
#' \code{gsub}). For example, if terms are part-of-speech tagged (e.g., |
|
29 |
#' \code{'a_DT'}), \code{'_.*'} would remove the tag. |
|
30 |
#' @param term.break If a category has more than \code{term.break} characters, it will be processed |
|
31 |
#' in chunks. Reduce from 20000 if you get a PCRE compilation error. |
|
32 |
#' @param dir Path to a folder in which to look for \code{dict}; \cr |
|
33 |
#' will look in \code{'~/Dictionaries'} by default. \cr |
|
34 |
#' Set a session default with \code{options(lingmatch.dict.dir = 'desired/path')}. |
|
35 |
#' @param coverage Logical; if \code{TRUE}, will calculate coverage |
|
36 |
#' (number of unique term matches) for each category. |
|
37 |
#' @seealso For applying pattern-based dictionaries (to raw text) see \code{\link{lma_patcat}()}. |
|
38 |
#' @family Dictionary functions |
|
39 |
#' @return A matrix with a row per \code{dtm} row and columns per dictionary category |
|
40 |
#' (with added \code{coverage_} versions if \code{coverage} is \code{TRUE}), |
|
41 |
#' and a \code{WC} attribute with original word counts. |
|
42 |
#' @examples |
|
43 |
#' dict <- list(category = c("cat", "dog", "pet*")) |
|
44 |
#' lma_termcat(c( |
|
45 |
#' "cat, cat, cat, cat, cat, cat, cat, cat", |
|
46 |
#' "a cat, dog, or anything petlike, really", |
|
47 |
#' "petite petrochemical petitioned petty peter for petrified petunia petals" |
|
48 |
#' ), dict, coverage = TRUE) |
|
49 |
#' |
|
50 |
#' \dontrun{ |
|
51 |
#' |
|
52 |
#' # Score texts with the NRC Affect Intensity Lexicon |
|
53 |
#' |
|
54 |
#' dict <- readLines("https://saifmohammad.com/WebDocs/NRC-AffectIntensity-Lexicon.txt") |
|
55 |
#' dict <- read.table( |
|
56 |
#' text = dict[-seq_len(grep("term\tscore", dict, fixed = TRUE)[[1]])], |
|
57 |
#' col.names = c("term", "weight", "category") |
|
58 |
#' ) |
|
59 |
#' |
|
60 |
#' text <- c( |
|
61 |
#' angry = paste( |
|
62 |
#' "We are outraged by their hateful brutality,", |
|
63 |
#' "and by the way they terrorize us with their hatred." |
|
64 |
#' ), |
|
65 |
#' fearful = paste( |
|
66 |
#' "The horrific torture of that terrorist was tantamount", |
|
67 |
#' "to the terrorism of terrorists." |
|
68 |
#' ), |
|
69 |
#' joyous = "I am jubilant to be celebrating the bliss of this happiest happiness.", |
|
70 |
#' sad = paste( |
|
71 |
#' "They are nearly suicidal in their mourning after", |
|
72 |
#' "the tragic and heartbreaking holocaust." |
|
73 |
#' ) |
|
74 |
#' ) |
|
75 |
#' |
|
76 |
#' emotion_scores <- lma_termcat(text, dict) |
|
77 |
#' if (require("splot")) splot(emotion_scores ~ names(text), leg = "out") |
|
78 |
#' |
|
79 |
#' ## or use the standardized version (which includes more categories) |
|
80 |
#' |
|
81 |
#' emotion_scores <- lma_termcat(text, "nrc_eil", dir = "~/Dictionaries") |
|
82 |
#' emotion_scores <- emotion_scores[, c("anger", "fear", "joy", "sadness")] |
|
83 |
#' if (require("splot")) splot(emotion_scores ~ names(text), leg = "out") |
|
84 |
#' } |
|
85 |
#' @export |
|
86 | ||
87 |
lma_termcat <- function(dtm, dict, term.weights = NULL, bias = NULL, bias.name = "_intercept", |
|
88 |
escape = TRUE, partial = FALSE, glob = TRUE, term.filter = NULL, term.break = 2e4, |
|
89 |
to.lower = FALSE, dir = getOption("lingmatch.dict.dir"), coverage = FALSE) { |
|
90 | 94x |
st <- proc.time()[[3]] |
91 | 94x |
if (ckd <- dir == "") dir <- "~/Dictionaries" |
92 | 10x |
if (missing(dict)) dict <- lma_dict(1:9) |
93 | ! |
if (is.factor(dict)) dict <- as.character(dict) |
94 | 94x |
if (is.character(dict) && length(dict) == 1 && missing(term.weights) && (file.exists(dict) || !grepl("\\s", dict))) { |
95 | ! |
if (!file.exists(dict) && any(file.exists(normalizePath(paste0(dir, "/", dict), "/", FALSE)))) { |
96 | ! |
dict <- normalizePath(paste0(dir, "/", dict)) |
97 |
} |
|
98 | ! |
td <- tryCatch(read.dic(dict, dir = if (ckd) "" else dir), error = function(e) NULL) |
99 | ! |
dict <- if (is.null(td)) list(cat1 = dict) else td |
100 |
} |
|
101 | 94x |
if (!is.null(dim(dict))) { |
102 | 11x |
if (!is.null(term.weights)) { |
103 | 2x |
if (is.character(term.weights) && any(su <- term.weights %in% colnames(dict))) { |
104 | 2x |
term.weights <- dict[, term.weights[su], drop = FALSE] |
105 |
} |
|
106 | 2x |
if (!is.null(dim(term.weights))) { |
107 | 2x |
term.weights <- term.weights[, vapply( |
108 | 2x |
seq_len(ncol(term.weights)), |
109 | 2x |
function(col) is.numeric(term.weights[, col]), TRUE |
110 |
)] |
|
111 |
} |
|
112 | 9x |
} else if (any(su <- vapply(seq_len(ncol(dict)), function(col) is.numeric(dict[, col]), TRUE))) { |
113 | 9x |
term.weights <- dict[, su, drop = FALSE] |
114 | 9x |
dict <- if (all(su)) { |
115 | 2x |
if (!is.null(rownames(dict))) { |
116 | 2x |
data.frame(term = rownames(dict), stringsAsFactors = FALSE) |
117 |
} else { |
|
118 | ! |
term.weights <- if (ncol(term.weights) == 1) NULL else term.weights[, -1, drop = FALSE] |
119 | ! |
dict[, 1, drop = FALSE] |
120 |
} |
|
121 |
} else { |
|
122 | 7x |
dict[, !su, drop = FALSE] |
123 |
} |
|
124 |
} |
|
125 | 11x |
if (!is.null(rownames(dict)) && ncol(dict) == 1 && any(grepl("^[a-z]", rownames(dict), TRUE))) { |
126 | ! |
dict <- rownames(dict) |
127 |
} else { |
|
128 | 11x |
su <- vapply(seq_len(ncol(dict)), function(col) !is.numeric(dict[, col]), TRUE) |
129 | ! |
if (!any(su)) stop("no terms found in dictionary") |
130 | 11x |
dict <- if (sum(su) > 1) { |
131 | 4x |
su <- which(su) |
132 | 4x |
if (!is.null(term.weights) && (!is.list(term.weights) || ncol(term.weights) == 1)) { |
133 | 3x |
if (is.list(term.weights)) term.weights <- term.weights[, 1] |
134 | 3x |
ssu <- vapply(su, function(col) length(unique(dict[, col])), 0) + seq(length(su), 1) |
135 | 3x |
term.weights <- split(term.weights, dict[, which.min(ssu)]) |
136 | 3x |
dict <- split(dict[, which.max(ssu)], dict[, which.min(ssu)]) |
137 |
} else { |
|
138 | 1x |
ssu <- vapply(su, function(col) anyDuplicated(dict[, col]) == 0, TRUE) |
139 | ! |
if (any(ssu)) dict[, su[ssu][1]] else dict[, su[1]] |
140 |
} |
|
141 |
} else { |
|
142 | 7x |
dict[, su] |
143 |
} |
|
144 |
} |
|
145 |
} |
|
146 | 94x |
if (is.numeric(dict) && is.null(term.weights)) { |
147 | 3x |
term.weights <- dict |
148 | 3x |
dict <- names(dict) |
149 |
} |
|
150 | ! |
if (is.factor(dict)) dict <- as.character(dict) |
151 | 94x |
if (!is.null(dim(term.weights))) { |
152 | 1x |
if (is.null(colnames(term.weights))) colnames(term.weights) <- if (length(dict) == length(term.weights)) names(dict) else paste0("cat", seq_len(ncol(term.weights))) |
153 | 1x |
if (!is.data.frame(term.weights)) term.weights <- as.data.frame(term.weights, stringsAsFactors = FALSE) |
154 | 10x |
su <- vapply(term.weights, is.numeric, TRUE) |
155 | 10x |
if (any(!su)) { |
156 | 1x |
if (any(ssu <- !su & vapply(term.weights, anyDuplicated, 0) == 0)) { |
157 | 1x |
rownames(term.weights) <- term.weights[, which(ssu)[1]] |
158 |
} |
|
159 | 1x |
term.weights <- term.weights[, su] |
160 |
} |
|
161 | ! |
if (!length(term.weights)) stop("no numeric columns in term.weights") |
162 |
} |
|
163 | 94x |
if (!is.list(dict)) { |
164 | 32x |
dict <- if (is.matrix(dict)) { |
165 | ! |
as.data.frame(dict, stringsAsFactors = FALSE) |
166 | 32x |
} else if (is.character(dict) && length(dict) == 1 && (file.exists(dict) || dict %in% rownames(select.dict()$info))) { |
167 | ! |
read.dic(dict, dir = if (ckd) "" else dir) |
168 |
} else { |
|
169 | 32x |
list(dict) |
170 |
} |
|
171 |
} |
|
172 | 94x |
if (is.list(dict)) { |
173 | 94x |
if (is.null(names(dict))) { |
174 | 44x |
tn <- if (!is.null(colnames(term.weights))) colnames(term.weights) else names(term.weights) |
175 | 44x |
names(dict) <- if (!is.null(tn) && length(tn) == length(dict)) tn else paste0("cat", seq_along(dict)) |
176 | 50x |
} else if (any(su <- names(dict) == "")) { |
177 | 2x |
names(dict)[su] <- if (sum(su) == 1) "cat_unnamed" else paste0("cat_unnamed", seq_len(sum(su))) |
178 | 2x |
if (!is.null(term.weights) && any(su <- names(term.weights) == "")) { |
179 | 2x |
names(term.weights)[su] <- if (sum(su) == 1) "cat_unnamed" else paste0("cat_unnamed", seq_len(sum(su))) |
180 |
} |
|
181 |
} |
|
182 |
} else { |
|
183 | ! |
dict <- list(dict) |
184 |
} |
|
185 | 94x |
if (!is.null(term.weights)) { |
186 | 28x |
if (is.null(dim(term.weights))) { |
187 | 18x |
if (is.list(term.weights)) { |
188 | ! |
if (length(dict) != length(term.weights) && !is.null(names(term.weights[[1]]))) dict <- term.weights |
189 | 8x |
if (length(dict) == length(term.weights) && !all(names(dict) %in% names(term.weights))) { |
190 | 1x |
if (is.null(names(term.weights)) || !any(names(term.weights) %in% names(dict))) { |
191 | 1x |
names(term.weights) <- names(dict) |
192 |
} else { |
|
193 | ! |
for (cat in names(dict)[!names(dict) %in% names(term.weights)]) { |
194 | ! |
term.weights[cat] <- structure(rep(1, length(dict[[cat]])), names = dict[[cat]]) |
195 |
} |
|
196 |
} |
|
197 |
} |
|
198 | 8x |
for (cat in names(dict)) { |
199 | 16x |
if (is.null(names(term.weights[[cat]]))) { |
200 | 14x |
if (length(term.weights[[cat]]) == length(dict[[cat]])) { |
201 | 14x |
names(term.weights[[cat]]) <- dict[[cat]] |
202 |
} else { |
|
203 | ! |
term.weights[[cat]] <- structure(rep(1, length(dict[[cat]])), names = dict[[cat]]) |
204 |
} |
|
205 |
} |
|
206 |
} |
|
207 |
} else { |
|
208 | 10x |
if (is.null(names(term.weights))) { |
209 | 7x |
if (length(dict[[1]]) == length(term.weights)) { |
210 | 7x |
term.weights <- list(term.weights) |
211 | 7x |
names(term.weights) <- names(dict) |
212 | 7x |
names(term.weights[[1]]) <- dict[[1]] |
213 |
} else { |
|
214 | ! |
term.weights <- NULL |
215 | ! |
warning("term.weights were dropped as they could not be aligned with dict") |
216 |
} |
|
217 |
} |
|
218 |
} |
|
219 |
} else { |
|
220 | 10x |
if (length(dict) == 1 && length(dict[[1]]) == nrow(term.weights) && |
221 | 10x |
!any(grepl("[a-z]", rownames(term.weights), TRUE))) { |
222 | ! |
if (is.factor(dict[[1]])) dict[[1]] <- as.character(dict[[1]]) |
223 | 7x |
if (anyDuplicated(dict[[1]])) { |
224 | 1x |
dt <- unique(dict[[1]][duplicated(dict[[1]])]) |
225 | 1x |
su <- dict[[1]] %in% dt |
226 | 1x |
td <- term.weights[su, ] |
227 | 1x |
tw <- matrix(0, length(dt), ncol(term.weights), dimnames = list(dt, colnames(term.weights))) |
228 | 1x |
for (term in dt) tw[term, ] <- colMeans(term.weights[dict[[1]] == term, , drop = FALSE], na.rm = TRUE) |
229 | 1x |
term.weights <- rbind(term.weights[!su, ], tw) |
230 | 1x |
rownames(term.weights) <- c(dict[[1]][!su], dt) |
231 | 1x |
dict[[1]] <- rownames(term.weights) |
232 |
} else { |
|
233 | 6x |
rownames(term.weights) <- dict[[1]] |
234 |
} |
|
235 |
} |
|
236 |
} |
|
237 | 28x |
if (!is.null(term.weights)) { |
238 | 3x |
if (!is.list(term.weights)) term.weights <- list(term.weights) |
239 | 28x |
dlen <- length(dict) |
240 | 28x |
if (is.null(names(term.weights))) { |
241 | 3x |
names(term.weights) <- if (length(term.weights) == dlen) names(dict) else seq_along(term.weights) |
242 |
} |
|
243 | 28x |
if (length(term.weights) > dlen && dlen == 1 && all(vapply(term.weights, length, 0) == length(dict[[1]]))) { |
244 | 10x |
dict <- lapply(term.weights, function(ws) dict[[1]]) |
245 |
} |
|
246 |
} |
|
247 |
} |
|
248 | 94x |
dict <- lapply(dict, function(cat) { |
249 | 404x |
if (!is.character(cat)) { |
250 | 2x |
if (is.null(names(cat))) as.character(cat) else names(cat) |
251 |
} else { |
|
252 | 402x |
cat |
253 |
} |
|
254 |
}) |
|
255 | 94x |
if (!is.null(bias) && is.null(names(bias))) { |
256 | 3x |
names(bias) <- if (length(bias) == length(dict)) names(dict) else seq_along(bias) |
257 |
} |
|
258 | 28x |
if (!is.null(names(term.weights)) && length(names(term.weights)) == length(dict)) names(dict) <- names(term.weights) |
259 | 94x |
for (n in names(dict)) { |
260 | 404x |
if (!n %in% names(bias) && any(ii <- !is.na(dict[[n]]) & dict[[n]] == bias.name)) { |
261 | 16x |
bias[n] <- term.weights[[n]][ii] |
262 | 16x |
term.weights[[n]][ii] <- 0 |
263 |
} |
|
264 |
} |
|
265 | 94x |
dict_chars <- list( |
266 | 94x |
all = paste(unique(strsplit(paste0(unique(unlist(dict, use.names = FALSE)), collapse = ""), "")[[1]]), |
267 | 94x |
collapse = "" |
268 |
) |
|
269 |
) |
|
270 | 94x |
dict_chars$alpha <- gsub("[^A-Za-z]", "", dict_chars$all) |
271 | 94x |
dict_chars$case <- if (grepl("[A-Z]", dict_chars$alpha)) { |
272 | 1x |
if (grepl("[a-z]", dict_chars$alpha)) "mixed" else "upper" |
273 |
} else { |
|
274 | 89x |
"lower" |
275 |
} |
|
276 | 94x |
edtm <- substitute(dtm) |
277 | 1x |
if (is.factor(dtm)) dtm <- as.character(dtm) |
278 | 94x |
if (is.character(dtm) || !any(grepl("\\s", colnames(dtm)))) { |
279 | 94x |
if (any(grepl("\\s", unlist(dict, use.names = FALSE)))) { |
280 | 3x |
if (is.character(dtm)) { |
281 | 3x |
warning( |
282 | 3x |
"dict has terms with spaces, so using lma_patcat instead;", |
283 | 3x |
"\n enter a dtm (e.g., lma_dtm(", paste0(edtm, collapse = ""), ")) to force lma_termcat use" |
284 |
) |
|
285 | 3x |
args <- list(text = dtm, dict = dict) |
286 | 1x |
if (!is.null(term.weights)) args$pattern.weights <- term.weights |
287 | 1x |
if (!is.null(bias)) args$bias <- bias |
288 | 1x |
if (!missing(glob)) args$globtoregex <- glob |
289 | 1x |
if (!missing(partial) && !partial) args$boundary <- "\\b" |
290 | 3x |
if (!missing(dir)) args$dir <- if (ckd) "" else dir |
291 | 3x |
return(do.call(lma_patcat, args)) |
292 |
} |
|
293 |
} |
|
294 | 91x |
if (is.character(dtm)) { |
295 | 1x |
if (dict_chars$case == "upper") dtm <- toupper(dtm) |
296 | 10x |
dtm <- lma_dtm(dtm, |
297 | 10x |
numbers = grepl("[0-9]", dict_chars$all), punct = grepl('[_/\\?!."-]', dict_chars$all), |
298 | 10x |
to.lower = dict_chars$case == "lower" |
299 |
) |
|
300 |
} |
|
301 |
} |
|
302 | ! |
if (is.null(dim(dtm))) dtm <- t(dtm) |
303 | 91x |
ats <- attributes(dtm)[c("opts", "WC", "type")] |
304 | 91x |
ats <- ats[!vapply(ats, is.null, TRUE)] |
305 | 91x |
atsn <- names(ats) |
306 | 91x |
ws <- if (is.null(term.filter)) colnames(dtm) else gsub(term.filter, "", colnames(dtm), perl = TRUE) |
307 | 91x |
if ((missing(to.lower) || !is.logical(to.lower)) && dict_chars$case != "mixed") { |
308 | 86x |
text_case <- if (any(grepl("[A-Z]", ws))) if (any(grepl("[a-z]", ws))) "mixed" else "upper" else "lower" |
309 | 86x |
if (text_case == "upper") { |
310 | 1x |
dict <- lapply(dict, toupper) |
311 | 1x |
dict_chars$case <- "upper" |
312 |
} |
|
313 | 86x |
to.lower <- text_case == "lower" |
314 |
} |
|
315 | 91x |
if (to.lower && dict_chars$case != "lower") { |
316 | ! |
dict <- lapply(dict, tolower) |
317 | ! |
dict_chars$case <- "lower" |
318 |
} |
|
319 | 87x |
if (dict_chars$case != "mixed") ws <- (if (dict_chars$case == "lower") tolower else toupper)(ws) |
320 | 91x |
odict <- dict |
321 | 91x |
boundaries <- FALSE |
322 | 91x |
formatdict <- function(dict, collapse = "|") { |
323 | 142x |
lab <- if (!escape) { |
324 | 38x |
lab <- lapply(dict, function(l) { |
325 | 321x |
if (!any(grepl("[][)(}{]", l))) { |
326 | 318x |
return(FALSE) |
327 |
} |
|
328 | 3x |
sl <- strsplit(l, "") |
329 | 3x |
!any(grepl("\\[.+\\]|\\(.+\\)|\\{.+\\}", l)) || any(vapply( |
330 | 3x |
sl, function(cs) { |
331 | 2x |
sum(sl == "[") != sum(sl == "]") & |
332 | 2x |
sum(sl == "{") != sum(sl == "}") & |
333 | 2x |
sum(sl == "(") != sum(sl == ")") |
334 |
}, |
|
335 | 3x |
TRUE |
336 |
)) |
|
337 |
}) |
|
338 | 38x |
Filter(isTRUE, lab) |
339 |
} else { |
|
340 | 104x |
logical() |
341 |
} |
|
342 | 142x |
if (!partial) { |
343 | 105x |
s <- "^" |
344 | 105x |
e <- "$" |
345 |
} else { |
|
346 | 37x |
s <- e <- "" |
347 |
} |
|
348 | 142x |
rec <- "([][)(}{*.^$+?\\|\\\\])" |
349 | 142x |
if (length(lab)) { |
350 | 1x |
for (l in names(lab)) dict[[l]][lab[[l]]] <- gsub("([][)(}{])", "\\\\\\1", dict[[l]][lab[[l]]]) |
351 | 1x |
rec <- "([*.^$+?\\|])" |
352 |
} |
|
353 | 142x |
res <- if (escape) { |
354 | 104x |
lapply(dict, function(l) { |
355 | 129x |
paste0(s, gsub(rec, "\\\\\\1", l, perl = TRUE), e, collapse = collapse) |
356 |
}) |
|
357 |
} else { |
|
358 | 38x |
lapply(dict, function(l) paste(paste0(s, gsub("([+*])[+*]+", "\\\\\\1+", l), e), collapse = collapse)) |
359 |
} |
|
360 | 142x |
if (glob) { |
361 | 101x |
lapply(res, function(l) { |
362 | 126x |
gsub(paste0( |
363 | 126x |
if (s == "^") "\\" else "", s, if (escape) "\\\\" else "", "\\*|", if (escape) "\\\\" else "", "\\*", if (e == "$") { |
364 |
"\\" |
|
365 |
} else { |
|
366 |
"" |
|
367 | 126x |
}, e |
368 | 126x |
), "", l) |
369 |
}) |
|
370 |
} else { |
|
371 | 41x |
res |
372 |
} |
|
373 |
} |
|
374 | 91x |
for (l in dict) { |
375 | 121x |
if (!boundaries) boundaries <- !any(grepl("^\\*|\\*$", l)) && any(grepl("^\\^|\\$$", l)) |
376 | 35x |
if (missing(partial) && boundaries) partial <- TRUE |
377 | 40x |
if (missing(glob) && (any(grepl("([][}{.^$+?\\|\\\\])", l)) || any(grepl("\\w\\*\\w", l)))) glob <- FALSE |
378 | 401x |
if (missing(escape) && (boundaries || any(grepl("[.])][+*]|[.+*]\\?|\\[\\^", l))) && |
379 | 401x |
!any(grepl("[({[][^])}]*$|^[^({[]*[])}]", l))) { |
380 | 36x |
escape <- FALSE |
381 |
} |
|
382 |
} |
|
383 | 91x |
cls <- 0 |
384 | 91x |
if (is.null(term.weights)) { |
385 | 64x |
cls <- structure(numeric(length(dict)), names = names(dict)) |
386 | 64x |
for (cat in seq_along(dict)) { |
387 | 356x |
ccls <- tryCatch(nchar(dict[[cat]]), error = function(e) NULL) |
388 | 356x |
if (is.null(ccls)) { |
389 | ! |
warning( |
390 | ! |
"dict appears to be misencoded, so results may not be as expected;\n", |
391 | ! |
'might try reading the dictionary in with encoding = "latin1"' |
392 |
) |
|
393 | ! |
dict[[cat]] <- iconv(dict[[cat]], sub = "#") |
394 | ! |
ccls <- nchar(dict[[cat]]) |
395 |
} |
|
396 | 356x |
cls[cat] <- sum(ccls) |
397 |
} |
|
398 |
} |
|
399 | 91x |
if (any(cls > term.break)) { |
400 | 3x |
br <- function(l, e = term.break) { |
401 | 3x |
f <- ceiling(cls[[l]] / e) |
402 | 3x |
l <- length(dict[[l]]) |
403 | 3x |
e <- ceiling(l / f) |
404 | 3x |
o <- lapply(seq_len(f), function(i) seq_len(e) + e * (i - 1)) |
405 | 3x |
o[[f]] <- o[[f]][o[[f]] <= l] |
406 | 3x |
o |
407 |
} |
|
408 | 3x |
op <- matrix(0, nrow(dtm), length(dict), dimnames = list(rownames(dtm), names(dict))) |
409 | ! |
if (coverage) cop <- op |
410 | 3x |
for (cat in names(dict)) { |
411 | 5x |
matches <- if (cls[[cat]] > term.break) { |
412 | 3x |
unique(unlist(lapply(br(cat), function(s) { |
413 | 52x |
grep(formatdict(list(dict[[cat]][s]))[[1]], ws, perl = TRUE) |
414 |
}))) |
|
415 |
} else { |
|
416 | 2x |
grep(formatdict(list(dict[[cat]])), ws, perl = TRUE) |
417 |
} |
|
418 | 5x |
if (length(matches)) { |
419 | 5x |
su <- dtm[, matches, drop = FALSE] |
420 | 5x |
op[, cat] <- rowSums(su, na.rm = TRUE) |
421 | ! |
if (coverage) cop[, cat] <- rowSums(su != 0, na.rm = TRUE) |
422 |
} |
|
423 |
} |
|
424 | 3x |
if (coverage) { |
425 | ! |
colnames(cop) <- paste0("coverage_", colnames(op)) |
426 | ! |
op <- cbind(op, cop) |
427 |
} |
|
428 |
} else { |
|
429 | 88x |
if (!is.null(term.weights)) { |
430 | 27x |
dict <- formatdict(dict, NULL) |
431 | 27x |
terms <- unique(unlist(dict)) |
432 | 27x |
termmap <- lapply(terms, grep, ws, perl = TRUE, value = TRUE) |
433 | 27x |
names(termmap) <- unique(unlist(odict)) |
434 | 27x |
termmap <- Filter(length, termmap) |
435 | 27x |
if (is.null(dim(term.weights))) { |
436 | 17x |
op <- matrix(0, nrow(dtm), length(dict), dimnames = list(rownames(dtm), names(dict))) |
437 | 17x |
if (length(termmap)) { |
438 | 16x |
weights <- lapply(names(term.weights), function(n) { |
439 | 25x |
l <- term.weights[[n]] |
440 | 25x |
if (is.null(names(l)) && n %in% names(dict) && length(dict[[n]]) == length(l)) { |
441 | 1x |
names(term.weights[[n]]) <- dict[[n]] |
442 | 1x |
l <- term.weights[[n]] |
443 |
} |
|
444 | 13x |
if (any(su <- !names(termmap) %in% names(l))) l[names(termmap)[su]] <- 0 |
445 | 25x |
do.call(c, lapply(names(termmap), function(p) { |
446 | 706x |
structure(rep(l[[p]], length(termmap[[p]])), names = termmap[[p]]) |
447 |
})) |
|
448 |
}) |
|
449 | 16x |
names(weights) <- names(term.weights) |
450 | 16x |
for (cat in names(dict)) { |
451 | 25x |
if (length(weights[[cat]])) { |
452 | 25x |
op[, cat] <- as.numeric(dtm[, names(weights[[cat]]), drop = FALSE] %*% weights[[cat]]) |
453 |
} |
|
454 |
} |
|
455 |
} |
|
456 |
} else { |
|
457 | 10x |
if (length(termmap)) { |
458 | 10x |
weights <- do.call(rbind, lapply(names(termmap), function(p) { |
459 | 65x |
matrix( |
460 | 65x |
rep(as.numeric(term.weights[p, ]), length(termmap[[p]])), |
461 | 65x |
ncol = ncol(term.weights), dimnames = list(termmap[[p]], colnames(term.weights)) |
462 |
) |
|
463 |
})) |
|
464 | 10x |
op <- matrix(0, nrow(dtm), ncol(weights), dimnames = list(rownames(dtm), colnames(weights))) |
465 | 10x |
for (cat in colnames(op)) { |
466 | 19x |
op[, cat] <- as.numeric(dtm[, rownames(weights), drop = FALSE] %*% weights[, cat]) |
467 |
} |
|
468 |
} else { |
|
469 | ! |
op <- matrix(0, nrow(dtm), length(dict), dimnames = list(rownames(dtm), colnames(weights))) |
470 |
} |
|
471 |
} |
|
472 |
} else { |
|
473 | 61x |
dict <- formatdict(dict) |
474 | 61x |
if (coverage) { |
475 | 1x |
op <- vapply(names(dict), function(cat) { |
476 | 3x |
su <- dtm[, grep(dict[[cat]], ws, perl = TRUE), drop = FALSE] |
477 | 3x |
c(rowSums(su != 0, na.rm = TRUE), rowSums(su, na.rm = TRUE)) |
478 | 1x |
}, numeric(nrow(dtm) * 2)) |
479 | 1x |
cop <- op[seq_len(nrow(dtm)), , drop = FALSE] |
480 | 1x |
colnames(cop) <- paste0("coverage_", names(dict)) |
481 | 1x |
op <- cbind(op[-seq_len(nrow(dtm)), , drop = FALSE], cop) |
482 |
} else { |
|
483 | 60x |
op <- vapply(names(dict), function(cat) { |
484 | 348x |
rowSums(dtm[, grep(dict[[cat]], ws, perl = TRUE), |
485 | 348x |
drop = FALSE |
486 | 348x |
], na.rm = TRUE) |
487 | 60x |
}, numeric(nrow(dtm))) |
488 |
} |
|
489 | 61x |
if (nrow(dtm) == 1) { |
490 | 7x |
op <- t(op) |
491 | 7x |
rownames(op) <- 1 |
492 |
} |
|
493 |
} |
|
494 |
} |
|
495 | 10x |
if (!is.null(bias)) for (n in names(bias)) if (n %in% colnames(op)) op[, n] <- op[, n] + bias[[n]] |
496 | 91x |
attr(op, "WC") <- if ("WC" %in% atsn) { |
497 | 43x |
ats$WC |
498 | 91x |
} else if (all(vapply(seq_len(ncol(dtm)), function(i) { |
499 | 3413x |
is.numeric(dtm[, i]) || is.integer(dtm[, i]) |
500 | 91x |
}, TRUE))) { |
501 | 48x |
rowSums(dtm, na.rm = TRUE) |
502 |
} else { |
|
503 | ! |
NULL |
504 |
} |
|
505 | 91x |
attr(op, "time") <- c(attr(dtm, "time"), termcat = proc.time()[[3]] - st) |
506 | 51x |
if ("type" %in% atsn) attr(op, "type") <- ats$type |
507 | 91x |
op |
508 |
} |
1 |
#' Read/Write Dictionary Files |
|
2 |
#' |
|
3 |
#' Read in or write dictionary files in Comma-Separated Values (.csv; weighted) or |
|
4 |
#' Linguistic Inquiry and Word Count (.dic; non-weighted) format. |
|
5 |
#' @param path Path to a file, a name corresponding to a file in \code{getOption('lingmatch.dict.dir')} |
|
6 |
#' (or \code{'~/Dictionaries'}) or one of the dictionaries available at \href{https://osf.io/y6g5b}{osf.io/y6g5b}, |
|
7 |
#' a matrix-like object to be categorized, or a list to be formatted. |
|
8 |
#' @param cats A character vector of category names to be returned. All categories are returned by default. |
|
9 |
#' @param type A character indicating whether and how terms should be altered. Unspecified or matching 'asis' |
|
10 |
#' leaves terms as they are. Other options change wildcards to regular expressions: |
|
11 |
#' \code{'pattern'} (\code{'^[poi]'}) replaces initial asterisks with \code{'\\\\b\\\\w*'}, |
|
12 |
#' and terminal asterisks with \code{'\\\\w*\\\\b'}, to match terms within raw text; |
|
13 |
#' for anything else, terms are padded with \code{^} and \code{$}, then those bounding marks are removed |
|
14 |
#' when an asterisk is present, to match tokenized terms. |
|
15 |
#' @param as.weighted Logical; if \code{TRUE}, prevents weighted dictionaries from being converted to |
|
16 |
#' unweighted versions, or converts unweighted dictionaries to a binary weighted version |
|
17 |
#' -- a data.frame with a "term" column of unique terms, and a column for each category. |
|
18 |
#' @param dir Path to a folder containing dictionaries, or where you would like dictionaries to be downloaded; |
|
19 |
#' passed to \code{\link{select.dict}} and/or \code{\link{download.dict}}. |
|
20 |
#' @param ... Passes arguments to \code{\link{readLines}}. |
|
21 |
#' @param term.name,category.name Strings identifying column names in \code{path} containing terms and categories |
|
22 |
#' respectively. |
|
23 |
#' @param raw Logical or a character. As logical, indicates if \code{path} should be treated |
|
24 |
#' as a raw dictionary (as might be read in from a .dic file). As a character, replaces \code{path} |
|
25 |
#' as if it were read in from a file. |
|
26 |
#' @return \code{read.dic}: A \code{list} (unweighted) with an entry for each category containing |
|
27 |
#' character vectors of terms, or a \code{data.frame} (weighted) with columns for terms (first, "term") and |
|
28 |
#' weights (all subsequent, with category labels as names). |
|
29 |
#' @family Dictionary functions |
|
30 |
#' @importFrom utils read.table write.table |
|
31 |
#' @export |
|
32 | ||
33 |
read.dic <- function(path, cats = NULL, type = "asis", as.weighted = FALSE, dir = getOption("lingmatch.dict.dir"), ..., |
|
34 |
term.name = "term", category.name = "category", raw = FALSE) { |
|
35 | 60x |
if (ckd <- dir == "") dir <- "~/Dictionaries" |
36 | 62x |
if (!is.logical(raw)) { |
37 | 1x |
path <- raw |
38 | 1x |
raw <- TRUE |
39 |
} |
|
40 | ! |
if (missing(path)) path <- file.choose() |
41 | 62x |
if (!raw) { |
42 | 61x |
if (is.character(path) && !any(file.exists(path)) && |
43 | 61x |
any(file.exists(normalizePath(paste0(dir, "/", path), "/", FALSE)))) { |
44 | 2x |
path <- normalizePath(paste0(dir, "/", path), "/", FALSE) |
45 |
} |
|
46 | 61x |
if (is.character(path) && !any(file.exists(path))) { |
47 | 3x |
tp <- select.dict(path, dir = if (ckd) "" else dir) |
48 | 3x |
if (nrow(tp$selected) && length(path) <= nrow(tp$info)) { |
49 | 2x |
if (any(tp$selected$downloaded == "")) { |
50 | ! |
td <- rownames(tp$selected)[tp$selected$downloaded == ""] |
51 | ! |
if (!ckd && grepl("^$|^[yt1]|^ent", readline(paste0( |
52 | ! |
"would you like to download ", if (length(td) == 1) "this dictionary" else "these dictionaries", "?:\n", |
53 | ! |
sub(",(?=[^,]+$)", if (length(td) == 2) " and" else ", and", paste0(td, collapse = ", "), perl = TRUE), |
54 | ! |
"\n(press Enter for yes): " |
55 |
)))) { |
|
56 | ! |
tp$selected[td, "downloaded"] <- unlist(download.dict(td, dir = dir), use.names = FALSE) |
57 |
} |
|
58 |
} |
|
59 | 2x |
path <- tp$selected[tp$selected[, "downloaded"] != "", "downloaded"] |
60 | 2x |
if (!length(path)) { |
61 | ! |
stop( |
62 | ! |
if (nrow(tp$selected) == 1) "dictionary" else "dictionaries", " (", |
63 | ! |
paste(rownames(tp$selected), collapse = ", "), ") not found in dir (", dir, ")", |
64 | ! |
if (ckd) '\nspecify a directory (e.g., dir = "~") to locate or download; see ?download.dict', |
65 | ! |
call. = FALSE |
66 |
) |
|
67 |
} |
|
68 |
} |
|
69 |
} |
|
70 | 61x |
if (is.character(path) && length(path) > 1 && any(file.exists(path))) { |
71 | 2x |
dicts <- list() |
72 | 2x |
for (p in path) { |
73 | 4x |
if (file.exists(p)) { |
74 | 4x |
name <- gsub("^.*[/\\]+|\\.[^.]+$", "", p) |
75 | 4x |
dicts[[name]] <- read.dic(p) |
76 |
} |
|
77 |
} |
|
78 | 2x |
path <- if (length(dicts) == 1) dicts[[1]] else dicts |
79 |
} |
|
80 |
} |
|
81 | 62x |
if (!is.null(dim(path))) { |
82 | 1x |
if (anyNA(path)) path[is.na(path)] <- 0 |
83 | 33x |
cols <- colnames(path) |
84 | 33x |
if (term.name %in% cols) { |
85 | 23x |
terms <- path[, term.name] |
86 | 23x |
cols <- cols[cols != term.name] |
87 | 10x |
} else if (!is.null(rownames(path)) && any(grepl("[a-z]", rownames(path), TRUE))) { |
88 | 1x |
terms <- rownames(path) |
89 |
} else { |
|
90 | 9x |
su <- which(vapply(cols, function(col) !is.numeric(path[, col]), TRUE)) |
91 | 9x |
if (!length(su)) { |
92 | 2x |
if (!is.null(colnames(path))) { |
93 | 1x |
path <- data.frame(term = colnames(path), t(path), stringsAsFactors = FALSE) |
94 | 1x |
terms <- path$term |
95 | 1x |
cols <- colnames(path)[-1] |
96 | 1x |
if (missing(as.weighted)) as.weighted <- TRUE |
97 |
} else { |
|
98 | 1x |
stop("no non-numeric columns found in path") |
99 |
} |
|
100 |
} else { |
|
101 | 7x |
if (length(su) > 1) { |
102 | 4x |
ssu <- vapply(su, function(col) { |
103 | 9x |
if (!anyDuplicated(path[, col])) { |
104 | 6x |
1 |
105 | 2x |
} else if (all(path[, col] == path[1, col])) 0 else 2 |
106 | 4x |
}, 0) |
107 | 4x |
if (length(su) == ncol(path) && !any(ssu == 0)) { |
108 | 2x |
path <- data.frame( |
109 | 2x |
term = unlist(path[, su], use.names = FALSE), |
110 | 2x |
category = rep(colnames(path), each = nrow(path)) |
111 |
) |
|
112 | 2x |
category.name <- cols <- "category" |
113 | 2x |
su <- 1 |
114 |
} else { |
|
115 | 2x |
if (any(ssu == 2)) { |
116 | ! |
cols <- colnames(path)[su[which(ssu == 2)]] |
117 | ! |
if (length(cols) > 1 && length(su) != ncol(path)) cols <- cols[1] |
118 |
} |
|
119 | 2x |
su <- if (any(ssu == 1)) su[which(ssu == 1)[[1]]] else su[[1]] |
120 |
} |
|
121 |
} |
|
122 | 7x |
terms <- path[, su] |
123 |
} |
|
124 |
} |
|
125 | 32x |
if (category.name %in% colnames(path)) { |
126 | 4x |
su <- which(vapply(cols, function(col) is.numeric(path[, col]), TRUE)) |
127 | 4x |
cols <- path[, category.name] |
128 | 4x |
if (length(su) == 1) { |
129 | 1x |
weights <- path[, names(su)] |
130 | 1x |
wl <- data.frame(term = terms) |
131 | 1x |
v <- numeric(nrow(path)) |
132 | 1x |
for (cat in unique(cols)) { |
133 | 2x |
su <- cols == cat |
134 | 2x |
v[su] <- weights[su] |
135 | 2x |
wl[, cat] <- v |
136 | 2x |
v[] <- 0 |
137 |
} |
|
138 |
} else { |
|
139 | 3x |
wl <- split(terms, cols) |
140 |
} |
|
141 |
} else { |
|
142 | 28x |
su <- vapply(cols, function(col) { |
143 | 78x |
if (is.numeric(path[, col])) { |
144 | 70x |
1 |
145 | 8x |
} else if (anyDuplicated(path[, col])) { |
146 | 1x |
if (!all(path[, col] == path[1, col])) 2 else 3 |
147 |
} else { |
|
148 | 7x |
0 |
149 |
} |
|
150 | 28x |
}, 0) |
151 | 28x |
wl <- if (!1 %in% su && any(su == 2) && any(su != 2)) { |
152 | ! |
cols <- cols[su == 2] |
153 | ! |
wl <- lapply(cols, function(col) split(terms, path[, col])) |
154 | ! |
names(wl) <- cols |
155 | ! |
wl |
156 |
} else { |
|
157 | 28x |
if (!any(su == 1)) { |
158 | 1x |
if (any(su > 1)) { |
159 | 1x |
cols <- cols[su > 1] |
160 | 1x |
if (length(cols) == 1) { |
161 | 1x |
split(terms, path[, cols]) |
162 |
} else { |
|
163 | ! |
wl <- lapply(cols, function(col) split(terms, path[, col])) |
164 | ! |
names(wl) <- cols |
165 | ! |
unlist(wl, FALSE) |
166 |
} |
|
167 |
} else { |
|
168 | ! |
stop("no numeric columns found in path") |
169 |
} |
|
170 | 27x |
} else if (as.weighted) { |
171 | 13x |
cbind(term = terms, path[, cols[su == 1], drop = FALSE]) |
172 |
} else { |
|
173 | 14x |
cols <- cols[su == 1] |
174 | 14x |
if (length(cols) == 1) { |
175 | 4x |
weights <- path[, cols] |
176 | 4x |
if (any(weights < 0) && any(weights > 0)) { |
177 | 3x |
Filter(length, list( |
178 | 3x |
positive = terms[weights > 0], |
179 | 3x |
neutral = terms[weights == 0], |
180 | 3x |
negative = terms[weights < 0] |
181 |
)) |
|
182 | ! |
} else if (anyDuplicated(weights)) split(terms, weights) else list(category = terms) |
183 |
} else { |
|
184 | 10x |
weights <- as.data.frame(path[, cols], stringsAsFactors = FALSE) |
185 | 10x |
if (any(weights > 0) && any(weights < 0)) { |
186 | 3x |
for (col in cols) { |
187 | 6x |
if (any(weights[, col] > 0)) weights[, paste0(col, ".positive")] <- as.integer(weights[, col] > 0) |
188 | 4x |
if (any(weights[, col] == 0)) weights[, paste0(col, ".neutral")] <- as.integer(weights[, col] == 0) |
189 | 4x |
if (any(weights[, col] < 0)) weights[, paste0(col, ".negative")] <- as.integer(weights[, col] < 0) |
190 | 6x |
weights <- weights[, colnames(weights) != col] |
191 |
} |
|
192 | 3x |
cols <- sort(colnames(weights)) |
193 |
} |
|
194 | 10x |
lvs <- sort(unique(unlist(weights))) |
195 | 10x |
if (length(lvs) == 2 && all(lvs == c(0, 1))) { |
196 | 8x |
wl <- lapply(cols, function(col) terms[weights[, col] == 1]) |
197 | 8x |
names(wl) <- cols |
198 | 8x |
wl <- Filter(length, wl) |
199 |
} else { |
|
200 | 2x |
wl <- split(terms, colnames(weights)[max.col(weights, "first")]) |
201 |
} |
|
202 | 10x |
wl |
203 |
} |
|
204 |
} |
|
205 |
} |
|
206 |
} |
|
207 | 29x |
} else if (is.list(path)) { |
208 | 13x |
path <- Filter(length, path) |
209 | 13x |
if (all(vapply(path, function(d) is.character(d) || is.factor(d), TRUE))) { |
210 | 5x |
wl <- path |
211 | 2x |
if (is.null(names(wl))) names(wl) <- paste0("cat", seq_along(wl)) |
212 | 5x |
if (!is.null(cats)) { |
213 | 2x |
wl <- wl[names(wl) %in% cats] |
214 | ! |
if (!length(wl)) stop("no cats were found in path") |
215 |
} |
|
216 |
} else { |
|
217 | 4x |
if (is.null(names(path))) names(path) <- paste0("dict", seq_along(path)) |
218 | 8x |
wl <- if (any(vapply(path, function(d) !is.null(dim(d)), TRUE))) { |
219 | 5x |
terms <- NULL |
220 | 5x |
cols <- NULL |
221 | 5x |
for (d in names(path)) { |
222 | 10x |
path[[d]] <- read.dic(path[[d]], as.weighted = as.weighted) |
223 | 10x |
if (as.weighted) { |
224 | 2x |
terms <- unique(c(terms, path[[d]]$term)) |
225 | 2x |
cols <- c(cols, paste0(d, ".", colnames(path[[d]])[-1])) |
226 |
} |
|
227 |
} |
|
228 | 5x |
if (as.weighted) { |
229 | 1x |
wl <- as.data.frame( |
230 | 1x |
matrix(0, length(terms), length(cols), dimnames = list(terms, cols)), |
231 | 1x |
stringsAsFactors = FALSE |
232 |
) |
|
233 | 1x |
for (d in names(path)) { |
234 | 2x |
cols <- colnames(path[[d]])[-1] <- paste0(d, ".", colnames(path[[d]])[-1]) |
235 | 2x |
su <- duplicated(path[[d]]$term) |
236 | 2x |
if (any(su)) { |
237 | 1x |
su <- path[[d]]$term %in% path[[d]]$term[su] |
238 | 1x |
td <- path[[d]][su, , drop = FALSE] |
239 | 1x |
for (term in unique(td$term)) wl[term, cols] <- colMeans(td[td$term == term, cols]) |
240 |
} |
|
241 | 2x |
if (any(!su)) path[[d]] <- path[[d]][!su, ] |
242 | 2x |
rownames(path[[d]]) <- path[[d]]$term |
243 | 2x |
wl[path[[d]]$term, cols] <- path[[d]][, cols] |
244 |
} |
|
245 | 1x |
data.frame(term = rownames(wl), wl, stringsAsFactors = FALSE) |
246 |
} else { |
|
247 | 4x |
unlist(path, FALSE) |
248 |
} |
|
249 | 8x |
} else if (any(vapply(path, is.list, TRUE))) { |
250 | 1x |
unlist(path, FALSE) |
251 |
} else { |
|
252 | 2x |
if (any(vapply(path, function(d) is.null(names(d)), TRUE))) { |
253 | 1x |
if (all(vapply(path, length, 0) == length(path[[1]]))) { |
254 | 1x |
data.frame(term = names(path), do.call(rbind, path), stringsAsFactors = FALSE) |
255 |
} else { |
|
256 | ! |
stop("failed to resolve path; as a list, entries should contain character or named numeric vectors") |
257 |
} |
|
258 |
} else { |
|
259 | 1x |
terms <- unique(unlist(lapply(path, names), use.names = FALSE)) |
260 | 1x |
v <- structure(numeric(length(terms)), names = terms) |
261 | 1x |
data.frame(term = terms, vapply(path, function(d) { |
262 | 2x |
v[names(d)] <- d |
263 | 2x |
v |
264 | 1x |
}, numeric(length(terms))), stringsAsFactors = FALSE) |
265 |
} |
|
266 |
} |
|
267 |
} |
|
268 |
} else { |
|
269 | 16x |
if (raw || length(path) != 1) { |
270 | 1x |
if (length(path) == 1) path <- strsplit(path, "\n")[[1]] |
271 | 2x |
di <- path |
272 |
} else { |
|
273 | 14x |
di <- tryCatch(readLines(path, warn = FALSE, ...), error = function(e) NULL) |
274 | 1x |
if (is.null(di)) stop("assumed path (", path, ") is to a file, but failed to read it in", call. = FALSE) |
275 |
} |
|
276 | 15x |
lst <- grep("%", di, fixed = TRUE) |
277 | 15x |
if (length(lst) > 1 && !grepl(",", di[lst[1]], fixed = TRUE)) { |
278 | 7x |
if (length(lst) < 2) { |
279 | ! |
stop( |
280 | ! |
"could not identify the end of the header -- ", |
281 | ! |
"this should be the second percent sign (%) following the last category name." |
282 |
) |
|
283 |
} |
|
284 | 7x |
lst <- lst[2] |
285 | 7x |
h <- grep("^\\d", gsub("^\\s+|\\s*%+\\s*|\\s+$", "", di[seq_len(lst)]), value = TRUE) |
286 | 7x |
ci <- character() |
287 | 7x |
for (cat in h) ci[sub("\\s.*$", "", cat)] <- sub("^[^\\s]+\\s+", "", cat, perl = TRUE) |
288 | 7x |
if (is.null(cats)) cats <- ci |
289 | 7x |
sep <- if (grepl("\t", di[lst + 1], fixed = TRUE)) "\t" else "\\s" |
290 | 7x |
cb <- paste0("(?:", sep, "+(?:", paste(names(ci), collapse = "|"), ")(?=", sep, "|$))*$") |
291 | 7x |
di <- di[-seq_len(lst - 1)] |
292 | 7x |
wl <- lapply(structure(names(ci), names = ci), function(cat) { |
293 | 52x |
unique(sub(cb, "", di[grep(paste0(sep, cat, cb), di, perl = TRUE)], perl = TRUE)) |
294 |
}) |
|
295 | 7x |
wl <- wl[cats[cats %in% names(wl)]] |
296 |
} else { |
|
297 | 6x |
if (missing(as.weighted) && length(path) == 1) as.weighted <- TRUE |
298 | 8x |
wl <- if (any(grepl("[\\s,]", di, perl = TRUE))) { |
299 | 7x |
di <- read.table( |
300 | 7x |
text = di, header = TRUE, sep = if (grepl("\t", di[[1]])) "\t" else ",", |
301 | 7x |
quote = '"', comment.char = "", stringsAsFactors = FALSE |
302 |
) |
|
303 | 7x |
if (!missing(as.weighted) || (!term.name %in% colnames(di) && !any(vapply(di, is.character, TRUE)) && |
304 | 7x |
!any(grepl("[a-z]", rownames(di), TRUE)))) { |
305 | 7x |
di <- tryCatch( |
306 | 7x |
read.dic(di, cats = cats, type = type, as.weighted = as.weighted), |
307 | 7x |
error = function(e) e$message |
308 |
) |
|
309 |
} |
|
310 | 7x |
di |
311 |
} else { |
|
312 | 1x |
list(cat1 = di) |
313 |
} |
|
314 | 8x |
if (length(wl) == 1 && is.character(wl)) { |
315 | ! |
stop("assuming path is to a comma separated values file, but failed to read it in:\n", wl) |
316 |
} |
|
317 |
} |
|
318 |
} |
|
319 | 1x |
if (!missing(type) && !grepl("^[Aa]", type)) wl <- to_regex(wl, grepl("^[poi]", type, TRUE)) |
320 | 60x |
if (is.null(dim(wl)) && !is.null(cats)) { |
321 | 9x |
su <- names(wl) %in% cats |
322 | 9x |
if (any(su)) wl <- wl[su] |
323 |
} |
|
324 | 60x |
if (as.weighted && is.null(dim(wl))) { |
325 | 1x |
op <- data.frame(term = unique(unlist(wl)), stringsAsFactors = FALSE) |
326 | 1x |
for (cat in names(wl)) op[, cat] <- as.integer(op$term %in% wl[[cat]]) |
327 | 1x |
op |
328 |
} else { |
|
329 | 59x |
if (is.data.frame(wl) && !as.weighted && !missing(as.weighted)) { |
330 | ! |
read.dic(wl, cats = cats, as.weighted = FALSE) |
331 |
} else { |
|
332 | 59x |
wl |
333 |
} |
|
334 |
} |
|
335 |
} |
|
336 | ||
337 |
#' @rdname read.dic |
|
338 |
#' @param dict A \code{list} with a named entry of terms for each category, or a \code{data.frame} |
|
339 |
#' with terms in one column, and categories or weights in the rest. |
|
340 |
#' @param filename The name of the file to be saved. |
|
341 |
#' @param save Logical: if \code{FALSE}, does not write a file. |
|
342 |
#' @return \code{write.dic}: A version of the written dictionary -- a raw character vector for |
|
343 |
#' unweighted dictionaries, or a \code{data.frame} for weighted dictionaries. |
|
344 |
#' @examples |
|
345 |
#' # make a small murder related dictionary |
|
346 |
#' dict <- list( |
|
347 |
#' kill = c("kill*", "murd*", "wound*", "die*"), |
|
348 |
#' death = c("death*", "dying", "die*", "kill*") |
|
349 |
#' ) |
|
350 |
#' |
|
351 |
#' # convert it to a weighted format |
|
352 |
#' (dict_weighted <- read.dic(dict, as.weighted = TRUE)) |
|
353 |
#' |
|
354 |
#' # categorize it back |
|
355 |
#' read.dic(dict_weighted) |
|
356 |
#' |
|
357 |
#' # convert it to a string without writing to a file |
|
358 |
#' cat(raw_dict <- write.dic(dict, save = FALSE)) |
|
359 |
#' |
|
360 |
#' # parse it back in |
|
361 |
#' read.dic(raw = raw_dict) |
|
362 |
#' |
|
363 |
#' \dontrun{ |
|
364 |
#' |
|
365 |
#' # save it as a .dic file |
|
366 |
#' write.dic(dict, "murder") |
|
367 |
#' |
|
368 |
#' # read it back in as a list |
|
369 |
#' read.dic("murder.dic") |
|
370 |
#' |
|
371 |
#' # read in the Moral Foundations or LUSI dictionaries from urls |
|
372 |
#' moral_dict <- read.dic("https://osf.io/download/whjt2") |
|
373 |
#' lusi_dict <- read.dic("https://osf.io/download/29ayf") |
|
374 |
#' |
|
375 |
#' # save and read in a version of the General Inquirer dictionary |
|
376 |
#' inquirer <- read.dic("inquirer", dir = "~/Dictionaries") |
|
377 |
#' } |
|
378 |
#' @export |
|
379 | ||
380 |
write.dic <- function(dict, filename = NULL, type = "asis", as.weighted = FALSE, save = TRUE) { |
|
381 | 5x |
if (!is.list(dict) || is.data.frame(dict)) { |
382 | 2x |
if (save && (missing(as.weighted) || as.weighted)) { |
383 | 2x |
as.weighted <- TRUE |
384 | 2x |
o <- dict |
385 |
} else { |
|
386 | ! |
dict <- read.dic(dict) |
387 |
} |
|
388 |
} |
|
389 | 5x |
if (is.null(dim(dict))) { |
390 | 3x |
terms <- unique(as.character(unlist(dict, use.names = FALSE))) |
391 | 3x |
terms <- terms[terms != ""] |
392 | ! |
if (!missing(type) && !grepl("^[Aa]", type)) dict <- to_regex(dict, grepl("^[poi]", type, TRUE)) |
393 | 3x |
if (as.weighted) { |
394 | 1x |
o <- data.frame(term = terms, stringsAsFactors = FALSE) |
395 | 1x |
for (cat in names(dict)) o[, cat] <- as.integer(o$term %in% dict[[cat]]) |
396 |
} else { |
|
397 | 2x |
l <- length(dict) |
398 | 2x |
m <- as.data.frame(matrix("", length(terms) + l + 2, l + 1), stringsAsFactors = FALSE) |
399 | 2x |
m[, 1] <- c("%", seq_len(l), "%", terms) |
400 | 2x |
m[seq_len(l) + 1, 2] <- if (is.null(names(dict))) seq_len(l) else names(dict) |
401 | 2x |
for (i in seq_along(dict)) m[which(m[-seq_len(l + 2), 1] %in% dict[[i]]) + l + 2, i + 1] <- i |
402 | 2x |
o <- gsub("\t{2,}", "\t", paste(sub("\t+$", "", do.call(paste, c(m, sep = "\t"))), collapse = "\n")) |
403 |
} |
|
404 |
} |
|
405 | 5x |
if (save && is.character(filename)) { |
406 | 4x |
filename <- filename[[1]] |
407 | ! |
if (!grepl("\\.[^.]+$", filename)) filename <- paste0(filename, if (as.weighted) ".csv" else ".dic") |
408 | 4x |
if (as.weighted) { |
409 | 3x |
write.table(o, filename, sep = ",", row.names = FALSE, qmethod = "double") |
410 |
} else { |
|
411 | 1x |
write(o, filename) |
412 |
} |
|
413 | 4x |
message("dictionary saved to ", filename) |
414 |
} |
|
415 | 5x |
invisible(o) |
416 |
} |
1 |
#' Generate a Report of Term Matches |
|
2 |
#' |
|
3 |
#' Extract matches to fuzzy terms (globs/wildcards or regular expressions) from provided text, in order |
|
4 |
#' to assess their appropriateness for inclusion in a dictionary. |
|
5 |
#' @param dict A vector of terms, list of such vectors, or a matrix-like object to be |
|
6 |
#' categorized by \code{\link{read.dic}}. |
|
7 |
#' @param text A vector of text to extract matches from. If not specified, will use the terms |
|
8 |
#' in the \code{term_map} retrieved from \code{\link{select.lspace}}. |
|
9 |
#' @param space A vector space used to calculate similarities between term matches. |
|
10 |
#' Name of a the space (see \code{\link{select.lspace}}), a matrix with terms as row names, or |
|
11 |
#' \code{TRUE} to auto-select a space based on matched terms. |
|
12 |
#' @param glob Logical; if \code{TRUE}, converts globs (asterisk wildcards) to regular expressions. |
|
13 |
#' If not specified, this will be set automatically. |
|
14 |
#' @param parse_phrases Logical; if \code{TRUE} (default) and \code{space} is specified, will |
|
15 |
#' break unmatched phrases into single terms, and average across and matched vectors. |
|
16 |
#' @param tolower Logical; if \code{FALSE}, will retain \code{text}'s case. |
|
17 |
#' @param punct Logical; if \code{FALSE}, will remove punctuation markings in \code{text}. |
|
18 |
#' @param special Logical; if \code{FALSE}, will attempt to replace special characters in \code{text}. |
|
19 |
#' @param as_terms Logical; if \code{TRUE}, will treat \code{text} as terms, meaning \code{dict} |
|
20 |
#' terms will only count as matches when matching the complete text. |
|
21 |
#' @param bysentence Logical; if \code{TRUE}, will split \code{text} into sentences, and only |
|
22 |
#' consider unique sentences. |
|
23 |
#' @param as_string Logical; if \code{FALSE}, returns matches as tables rather than a string. |
|
24 |
#' @param term_map_freq Proportion of terms to include when using the term map as a source |
|
25 |
#' of terms. Applies when \code{text} is not specified. |
|
26 |
#' @param term_map_spaces Number of spaces in which a term has to appear to be included. |
|
27 |
#' Applies when \code{text} is not specified. |
|
28 |
#' @param outFile File path to write results to, always ending in \code{.csv}. |
|
29 |
#' @param space_dir Directory from which \code{space} should be loaded. |
|
30 |
#' @param verbose Logical; if \code{FALSE}, will not display status messages. |
|
31 |
#' @family Dictionary functions |
|
32 |
#' @seealso For a more complete assessment of dictionaries, see \code{\link{dictionary_meta}()}. |
|
33 |
#' |
|
34 |
#' Similar information is provided in the \href{https://miserman.github.io/dictionary_builder/}{dictionary builder} web tool. |
|
35 |
#' @note |
|
36 |
#' Matches are extracted for each term independently, so they may not align with some implementations |
|
37 |
#' of dictionaries. For instance, by default \code{\link{lma_patcat}} matches destructively, and sorts |
|
38 |
#' terms by length such that shorter terms will not match the same text and longer terms that overlap. |
|
39 |
#' Here, the match would show up for both terms. |
|
40 |
#' @returns A \code{data.frame} of results, with a row for each unique term, and the following columns: |
|
41 |
#' \itemize{ |
|
42 |
#' \item \strong{\code{term}}: The originally entered term. |
|
43 |
#' \item \strong{\code{regex}}: The converted and applied regular expression form of the term. |
|
44 |
#' \item \strong{\code{categories}}: Comma-separated category names, |
|
45 |
#' if \code{dict} is a list with named entries. |
|
46 |
#' \item \strong{\code{count}}: Total number of matches to the term. |
|
47 |
#' \item \strong{\code{max_count}}: Number of matches to the most representative |
|
48 |
#' (that with the highest average similarity) variant of the term. |
|
49 |
#' \item \strong{\code{variants}}: Number of variants of the term. |
|
50 |
#' \item \strong{\code{space}}: Name of the latent semantic space, if one was used. |
|
51 |
#' \item \strong{\code{mean_sim}}: Average similarity to the most representative variant among terms |
|
52 |
#' found in the space, if one was used. |
|
53 |
#' \item \strong{\code{min_sim}}: Minimal similarity to the most representative variant. |
|
54 |
#' \item \strong{\code{matches}}: Variants, with counts and similarity (Pearson's r) to the |
|
55 |
#' most representative term (if a space was specified). Either in the form of a comma-separated |
|
56 |
#' string or a \code{data.frame} (if \code{as_string} is \code{FALSE}). |
|
57 |
#' } |
|
58 |
#' @examples |
|
59 |
#' text <- c( |
|
60 |
#' "I am sadly homeless, and suffering from depression :(", |
|
61 |
#' "This wholesome happiness brings joy to my heart! :D:D:D", |
|
62 |
#' "They are joyous in these fearsome happenings D:", |
|
63 |
#' "I feel weightless now that my sadness has been depressed! :()" |
|
64 |
#' ) |
|
65 |
#' dict <- list( |
|
66 |
#' sad = c("*less", "sad*", "depres*", ":("), |
|
67 |
#' happy = c("*some", "happ*", "joy*", "d:"), |
|
68 |
#' self = c("i *", "my *") |
|
69 |
#' ) |
|
70 |
#' |
|
71 |
#' report_term_matches(dict, text) |
|
72 |
#' @export |
|
73 | ||
74 |
report_term_matches <- function(dict, text = NULL, space = NULL, glob = TRUE, |
|
75 |
parse_phrases = TRUE, tolower = TRUE, punct = TRUE, special = TRUE, |
|
76 |
as_terms = FALSE, bysentence = FALSE, as_string = TRUE, |
|
77 |
term_map_freq = 1, term_map_spaces = 1, outFile = NULL, |
|
78 |
space_dir = getOption("lingmatch.lspace.dir"), verbose = TRUE) { |
|
79 | ! |
if (missing(dict)) stop("dict must be specified", call. = FALSE) |
80 | 7x |
collapsed_terms <- FALSE |
81 | 7x |
if (is.null(text)) { |
82 | 1x |
term_map <- select.lspace(dir = space_dir, get.map = TRUE)$term_map |
83 | 1x |
if (is.null(term_map)) { |
84 | ! |
stop( |
85 | ! |
"term map not found; specify `space_dir` or provide text", |
86 | ! |
call. = FALSE |
87 |
) |
|
88 |
} |
|
89 | 1x |
if (term_map_freq > 0 && term_map_freq < 1) { |
90 | ! |
term_map <- term_map[seq(1, ceiling(nrow(term_map) * term_map_freq)), ] |
91 |
} |
|
92 | 1x |
if (term_map_spaces > 0 && term_map_spaces < ncol(term_map)) { |
93 | 1x |
term_map <- term_map[rowSums(term_map != 0) >= term_map_spaces, ] |
94 |
} |
|
95 | 1x |
collapsed_terms <- as_terms <- TRUE |
96 | 1x |
text <- paste(rownames(term_map), collapse = " ") |
97 |
} |
|
98 | ! |
if (is.null(text) && is.null(space)) stop("either text or space must be specified", call. = FALSE) |
99 | 7x |
st <- proc.time()[[3]] |
100 | 7x |
if (!is.null(text) && !as_terms) { |
101 | 5x |
if (verbose) cat("preparing text (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
102 | ! |
if (bysentence) text <- read.segments(text, segment.size = 1, bysentence = TRUE)$text |
103 | 5x |
if (tolower) text <- tolower(text) |
104 | ! |
if (!punct) text <- gsub("[,_:;/\\\\.?!\"()\\{}[]|\\]", " ", text) |
105 | ! |
if (!special) text <- lma_dict("special", as.function = gsub)(text) |
106 | 5x |
text <- unique(text) |
107 |
} |
|
108 | 7x |
if (verbose) cat("preparing dict (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
109 | ! |
if (!is.null(dim(dict)) || (is.character(dict) && length(dict) == 1 && file.exists(dict))) dict <- read.dic(dict) |
110 | 7x |
terms <- data.frame(term = unique(unlist(dict, use.names = FALSE))) |
111 | 7x |
if (missing(glob)) { |
112 | 7x |
glob <- any(grepl("^\\*", terms$term)) |
113 | 7x |
if (!glob) { |
114 | 2x |
glob <- any(grepl("\\*$", terms$term)) |
115 | 1x |
if (glob) glob <- !any(grepl("(?:\\\\\\w|[].})])\\*$", terms$term)) |
116 |
} |
|
117 |
} |
|
118 | 7x |
rawtext <- !as_terms || collapsed_terms |
119 | 7x |
terms$regex <- to_regex(list(terms$term), TRUE, glob)[[1]] |
120 | 7x |
terms <- terms[!is.na(terms$regex) & terms$regex != "", ] |
121 | 7x |
terms$regex <- if (rawtext) paste0("\\b", terms$regex, "\\b") else paste0("^", terms$regex, "$") |
122 | 7x |
if (is.list(dict)) { |
123 | ! |
if (is.null(names(dict))) names(dict) <- paste0("cat_", seq_along(dict)) |
124 | 5x |
categories <- character(nrow(terms)) |
125 | 5x |
for (cat in names(dict)) { |
126 | 20x |
su <- terms$term %in% dict[[cat]] |
127 | 20x |
if (any(su)) { |
128 | 20x |
ssu <- su & categories != "" |
129 | 20x |
categories[ssu] <- paste0(categories[ssu], ", ", cat) |
130 | 20x |
categories[su & categories == ""] <- cat |
131 |
} |
|
132 |
} |
|
133 | 5x |
terms$categories <- categories |
134 |
} |
|
135 | 7x |
if (verbose) cat("extracting matches (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
136 | 7x |
matches <- extract_matches(terms$regex, text, rawtext) |
137 | 7x |
has_space <- FALSE |
138 | 7x |
space_name <- NULL |
139 | 7x |
if (!is.null(space)) { |
140 | 2x |
obs <- unique(unlist(lapply(matches, names), use.names = FALSE)) |
141 | 2x |
if (is.logical(space) && space) { |
142 | 1x |
space <- select.lspace(terms = obs)$selected |
143 | 1x |
space <- if (nrow(space)) rownames(space)[1] else NULL |
144 |
} |
|
145 | 2x |
if (is.character(space)) { |
146 | 2x |
if (verbose) cat("loading space (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
147 | 2x |
space_name <- space |
148 | 2x |
space <- lma_lspace(obs, space, dir = space_dir) |
149 |
} |
|
150 | ! |
if (!nrow(space) || !any(obs %in% rownames(space))) space <- NULL |
151 | 2x |
if (is.null(space)) { |
152 | ! |
warning("failed to recognize space") |
153 |
} else { |
|
154 | 2x |
su <- obs %in% rownames(space) |
155 | 2x |
if (parse_phrases && any(!su)) { |
156 | 2x |
phrase <- grepl("[ _/-]", obs) |
157 | 2x |
if (any(phrase)) { |
158 | 2x |
if (verbose) cat("parsing phrases (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
159 | 2x |
split_parts <- strsplit(obs[phrase], "[ _/-]") |
160 | 2x |
parts <- unique(unlist(split_parts, use.names = FALSE)) |
161 | 2x |
part_vectors <- if (is.null(space_name)) { |
162 | ! |
if (any(parts %in% rownames(space))) space[parts[parts %in% rownames(space)]] else space[0, ] |
163 |
} else { |
|
164 | 2x |
lma_lspace(parts, space_name) |
165 |
} |
|
166 | 2x |
if (nrow(part_vectors)) { |
167 | 2x |
space_terms <- rownames(part_vectors) |
168 | 2x |
space_dim <- ncol(space) |
169 | 2x |
names(split_parts) <- obs[phrase] |
170 | 2x |
agg_vectors <- t(vapply(split_parts, function(p) { |
171 | 30x |
su <- p %in% space_terms |
172 | 30x |
if (any(su)) { |
173 | 30x |
colMeans(part_vectors[p[su], , drop = FALSE]) |
174 |
} else { |
|
175 | ! |
numeric(space_dim) |
176 |
} |
|
177 | 2x |
}, numeric(space_dim))) |
178 | 2x |
space <- rbind(space, agg_vectors[rowSums(agg_vectors) != 0, ]) |
179 |
} |
|
180 |
} |
|
181 |
} |
|
182 | ! |
if (is.null(space_name)) space_name <- "custom" |
183 | 2x |
has_space <- TRUE |
184 |
} |
|
185 |
} |
|
186 | 7x |
if (verbose) cat("preparing results (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
187 | 7x |
terms <- cbind(terms, do.call(rbind, lapply(matches, function(m) { |
188 | 58x |
hits <- if (length(m)) { |
189 | 6x |
if (collapsed_terms) m[!is.na(m) & m != 0] <- 1L |
190 | 37x |
if (has_space) { |
191 | 15x |
msim <- m |
192 | 15x |
if (length(m) == 1) { |
193 | 1x |
msim[] <- 1L |
194 |
} else { |
|
195 | 14x |
msim[] <- NA |
196 | 14x |
su <- names(m) %in% rownames(space) |
197 | 14x |
if (sum(su) == 1) { |
198 | ! |
msim[su] <- 1L |
199 | 14x |
} else if (any(su)) { |
200 | 14x |
ns <- names(m)[su] |
201 | 14x |
sims <- lma_simets(space[ns, ], metric = "pearson", symmetrical = TRUE) |
202 | 14x |
msim[su] <- as.numeric(sims[, which.max(colMeans(sims))]) |
203 |
} |
|
204 |
} |
|
205 | 15x |
o <- order(msim, -nchar(names(m)), decreasing = TRUE) |
206 | 15x |
m <- m[o] |
207 | 15x |
if (as_string) { |
208 | 15x |
paste(paste0(names(m), " (", if (!as_terms) paste0(m, ", "), round(msim[o], 2), ")"), collapse = ", ") |
209 |
} else { |
|
210 | ! |
list(as.data.frame(rbind(m, msim[o]))) |
211 |
} |
|
212 |
} else { |
|
213 | 22x |
m <- m[order(m, -nchar(names(m)), decreasing = TRUE)] |
214 | 22x |
if (as_string) { |
215 | 13x |
paste(paste0(names(m), if (!as_terms) paste0(" (", m, ")")), collapse = ", ") |
216 |
} else { |
|
217 | 9x |
list(t(as.data.frame(m))) |
218 |
} |
|
219 |
} |
|
220 |
} else { |
|
221 | 7x |
if (has_space) msim <- NA |
222 | 2x |
if (as_string) "" else list(data.frame()) |
223 |
} |
|
224 | 11x |
if (!as_string) rownames(hits[[1]]) <- NULL |
225 | 58x |
res <- if (collapsed_terms) { |
226 | 11x |
data.frame( |
227 | 11x |
count = length(m), |
228 | 11x |
max_count = if (all(is.na(m))) 0L else 1L, |
229 | 11x |
variants = length(m) |
230 |
) |
|
231 |
} else { |
|
232 | 47x |
data.frame( |
233 | 47x |
count = sum(m), |
234 | 47x |
max_count = if (all(is.na(m))) 0L else max(m, na.rm = TRUE), |
235 | 47x |
variants = length(m) |
236 |
) |
|
237 |
} |
|
238 | 58x |
if (has_space) { |
239 | 22x |
res$space <- space_name |
240 | 22x |
if (all(is.na(msim))) { |
241 | 7x |
res$mean_sim <- res$min_sim <- NA |
242 |
} else { |
|
243 | 15x |
res$mean_sim <- mean(msim, na.rm = TRUE) |
244 | 15x |
res$min_sim <- min(msim, na.rm = TRUE) |
245 |
} |
|
246 |
} |
|
247 | 58x |
res[["matches"]] <- hits |
248 | 58x |
res |
249 |
}))) |
|
250 | 7x |
terms <- terms[if (has_space) order(terms$mean_sim) else order(terms$variants, decreasing = TRUE), ] |
251 | 7x |
rownames(terms) <- NULL |
252 | 7x |
if (!is.null(outFile)) { |
253 | 1x |
outFile <- paste0(sub(".csv", "", outFile, fixed = TRUE), ".csv") |
254 | 1x |
if (verbose) cat("writing results: ", outFile, " (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
255 | 1x |
write.table(terms, outFile, sep = ",", row.names = FALSE, qmethod = "double") |
256 |
} |
|
257 | 7x |
if (verbose) cat("done (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
258 | 7x |
terms |
259 |
} |
1 |
#' Download Dictionaries |
|
2 |
#' |
|
3 |
#' Downloads the specified dictionaries from \href{https://osf.io/y6g5b}{osf.io/y6g5b}. |
|
4 |
#' |
|
5 |
#' @param dict One or more names of dictionaries to download, or \code{'all'} for all available. See |
|
6 |
#' \href{https://osf.io/y6g5b/wiki/home}{osf.io/y6g5b/wiki} for more information, and a list of available dictionaries. |
|
7 |
#' @param check.md5 Logical; if \code{TRUE} (default), retrieves the MD5 checksum from OSF, |
|
8 |
#' and compares it with that calculated from the downloaded file to check its integrity. |
|
9 |
#' @param mode A character specifying the file write mode; default is 'wb'. See |
|
10 |
#' \code{\link{download.file}}. |
|
11 |
#' @param dir Directory in which to save the dictionary; \cr default is \code{getOption('lingmatch.dict.dir')}. \cr |
|
12 |
#' This must be specified, or the option must be set -- use \code{\link{lma_initdirs}} to initialize a directory. |
|
13 |
#' @param overwrite Logical; if \code{TRUE}, will replace existing files. |
|
14 |
#' @return Path to the downloaded dictionary, or a list of such if multiple were downloaded. |
|
15 |
#' @family Dictionary functions |
|
16 |
#' @examples |
|
17 |
#' \dontrun{ |
|
18 |
#' |
|
19 |
#' download.dict("lusi", dir = "~/Dictionaries") |
|
20 |
#' } |
|
21 |
#' @export |
|
22 | ||
23 |
download.dict <- function( |
|
24 |
dict = "lusi", check.md5 = TRUE, mode = "wb", dir = getOption("lingmatch.dict.dir"), |
|
25 |
overwrite = FALSE) { |
|
26 | 1x |
download.resource( |
27 | 1x |
"dict", dict, |
28 | 1x |
check.md5 = check.md5, mode = mode, dir = dir, overwrite = overwrite |
29 |
) |
|
30 |
} |
1 |
#' Categorize Texts |
|
2 |
#' |
|
3 |
#' Categorize raw texts using a pattern-based dictionary. |
|
4 |
#' |
|
5 |
#' @param text A vector of text to be categorized. Texts are padded by 2 spaces, and potentially lowercased. |
|
6 |
#' @param dict At least a vector of terms (patterns), usually a matrix-like object with columns for terms, |
|
7 |
#' categories, and weights. |
|
8 |
#' @param pattern.weights A vector of weights corresponding to terms in \code{dict}, or the column name of |
|
9 |
#' weights found in \code{dict}. |
|
10 |
#' @param pattern.categories A vector of category names corresponding to terms in \code{dict}, or the column name of |
|
11 |
#' category names found in \code{dict}. |
|
12 |
#' @param bias A constant to add to each category after weighting and summing. Can be a vector with names |
|
13 |
#' corresponding to the unique values in \code{dict[, category]}, but is usually extracted from dict based |
|
14 |
#' on the intercept included in each category (defined by \code{name.map['intname']}). |
|
15 |
#' @param to.lower Logical indicating whether \code{text} should be converted to lowercase before processing. |
|
16 |
#' @param return.dtm Logical; if \code{TRUE}, only a document-term matrix will be returned, rather than the |
|
17 |
#' weighted, summed, and biased category values. |
|
18 |
#' @param exclusive Logical; if \code{FALSE}, each dictionary term is searched for in the original text. |
|
19 |
#' Otherwise (by default), terms are sorted by length (with longer terms being searched for first), and |
|
20 |
#' matches are removed from the text (avoiding subsequent matches to matched patterns). |
|
21 |
#' @param drop.zeros logical; if \code{TRUE}, categories or terms with no matches will be removed. |
|
22 |
#' @param boundary A string to add to the beginning and end of each dictionary term. If \code{TRUE}, |
|
23 |
#' \code{boundary} will be set to \code{' '}, avoiding pattern matches within words. By default, dictionary |
|
24 |
#' terms are left as entered. |
|
25 |
#' @param fixed Logical; if \code{FALSE}, patterns are treated as regular expressions. |
|
26 |
#' @param globtoregex Logical; if \code{TRUE}, initial and terminal asterisks are replaced with \code{\\\\b\\\\w*} |
|
27 |
#' and \code{\\\\w*\\\\b} respectively. This will also set \code{fixed} to \code{FALSE} unless fixed is specified. |
|
28 |
#' @param name.map A named character vector: |
|
29 |
#' \itemize{ |
|
30 |
#' \item \strong{\code{intname}}: term identifying category biases within the term list; |
|
31 |
#' defaults to \code{'_intercept'} |
|
32 |
#' \item \strong{\code{term}}: name of the column containing terms in \code{dict}; defaults to \code{'term'} |
|
33 |
#' } |
|
34 |
#' Missing names are added, so names can be specified positional (e.g., \code{c('_int',} \code{'terms')}), |
|
35 |
#' or only some can be specified by name (e.g., \code{c(term =} \code{'patterns')}), leaving the rest default. |
|
36 |
#' @param dir Path to a folder in which to look for \code{dict} if it is the name of a file to be passed to |
|
37 |
#' \code{\link{read.dic}}. |
|
38 |
#' @seealso For applying term-based dictionaries (to a document-term matrix) see \code{\link{lma_termcat}()}. |
|
39 |
#' @family Dictionary functions |
|
40 |
#' @return A matrix with a row per \code{text} and columns per dictionary category, or (when \code{return.dtm = TRUE}) |
|
41 |
#' a sparse matrix with a row per \code{text} and column per term. Includes a \code{WC} attribute with original |
|
42 |
#' word counts, and a \code{categories} attribute with row indices associated with each category if |
|
43 |
#' \code{return.dtm = TRUE}. |
|
44 |
#' @examples |
|
45 |
#' # example text |
|
46 |
#' text <- c( |
|
47 |
#' paste( |
|
48 |
#' "Oh, what youth was! What I had and gave away.", |
|
49 |
#' "What I took and spent and saw. What I lost. And now? Ruin." |
|
50 |
#' ), |
|
51 |
#' paste( |
|
52 |
#' "God, are you so bored?! You just want what's gone from us all?", |
|
53 |
#' "I miss the you that was too. I love that you." |
|
54 |
#' ), |
|
55 |
#' paste( |
|
56 |
#' "Tomorrow! Tomorrow--nay, even tonight--you wait, as I am about to change.", |
|
57 |
#' "Soon I will off to revert. Please wait." |
|
58 |
#' ) |
|
59 |
#' ) |
|
60 |
#' |
|
61 |
#' # make a document-term matrix with pre-specified terms only |
|
62 |
#' lma_patcat(text, c("bored?!", "i lo", ". "), return.dtm = TRUE) |
|
63 |
#' |
|
64 |
#' # get counts of sets of letter |
|
65 |
#' lma_patcat(text, list(c("a", "b", "c"), c("d", "e", "f"))) |
|
66 |
#' |
|
67 |
#' # same thing with regular expressions |
|
68 |
#' lma_patcat(text, list("[abc]", "[def]"), fixed = FALSE) |
|
69 |
#' |
|
70 |
#' # match only words |
|
71 |
#' lma_patcat(text, list("i"), boundary = TRUE) |
|
72 |
#' |
|
73 |
#' # match only words, ignoring punctuation |
|
74 |
#' lma_patcat( |
|
75 |
#' text, c("you", "tomorrow", "was"), |
|
76 |
#' fixed = FALSE, |
|
77 |
#' boundary = "\\b", return.dtm = TRUE |
|
78 |
#' ) |
|
79 |
#' |
|
80 |
#' \dontrun{ |
|
81 |
#' |
|
82 |
#' # read in the temporal orientation lexicon from the World Well-Being Project |
|
83 |
#' tempori <- read.csv(paste0( |
|
84 |
#' "https://raw.githubusercontent.com/wwbp/lexica/master/", |
|
85 |
#' "temporal_orientation/temporal_orientation_lexicon.csv" |
|
86 |
#' )) |
|
87 |
#' |
|
88 |
#' lma_patcat(text, tempori) |
|
89 |
#' |
|
90 |
#' # or use the standardized version |
|
91 |
#' tempori_std <- read.dic("wwbp_prospection", dir = "~/Dictionaries") |
|
92 |
#' |
|
93 |
#' lma_patcat(text, tempori_std) |
|
94 |
#' |
|
95 |
#' ## get scores on the same scale by adjusting the standardized values |
|
96 |
#' tempori_std[, -1] <- tempori_std[, -1] / 100 * |
|
97 |
#' select.dict("wwbp_prospection")$selected[, "original_max"] |
|
98 |
#' |
|
99 |
#' lma_patcat(text, tempori_std)[, unique(tempori$category)] |
|
100 |
#' } |
|
101 |
#' @export |
|
102 | ||
103 |
lma_patcat <- function(text, dict = NULL, pattern.weights = "weight", pattern.categories = "category", bias = NULL, |
|
104 |
to.lower = TRUE, return.dtm = FALSE, drop.zeros = FALSE, exclusive = TRUE, boundary = NULL, fixed = TRUE, |
|
105 |
globtoregex = FALSE, name.map = c(intname = "_intercept", term = "term"), dir = getOption("lingmatch.dict.dir")) { |
|
106 | 71x |
text_names <- names(text) |
107 | ! |
if (is.factor(text)) text <- as.character(text) |
108 | ! |
if (!is.character(text)) stop("enter a character vector as the first argument") |
109 | 71x |
text <- paste(" ", text, " ") |
110 | ! |
if (is.null(names(name.map)) && length(name.map) < 3) names(name.map) <- c("intname", "term")[seq_along(name.map)] |
111 | 71x |
wide <- FALSE |
112 | 2x |
if (missing(dict) && missing(pattern.weights) && missing(pattern.categories)) dict <- lma_dict() |
113 | 71x |
if (is.character(dict) && length(dict) == 1 && (file.exists(dict) || grepl("^[A-Za-z_]{3}", dict)) && |
114 | 71x |
missing(pattern.weights) && missing(pattern.categories)) { |
115 | 1x |
if (dir == "") dir <- "~/Dictionaries" |
116 | 1x |
if (!any(file.exists(dict)) && any(file.exists(normalizePath(paste0(dir, "/", dict), "/", FALSE)))) { |
117 | ! |
dict <- normalizePath(paste0(dir, "/", dict), "/", FALSE) |
118 |
} |
|
119 | 1x |
td <- tryCatch(read.dic(dict), error = function(e) NULL) |
120 | 1x |
dict <- if (is.null(td)) list(cat1 = dict) else td |
121 |
} |
|
122 | 71x |
if (!is.null(dim(dict))) { |
123 | 13x |
if (is.null(colnames(dict))) { |
124 | 1x |
colnames(dict) <- paste0("X", seq_len(ncol(dict))) |
125 |
} else { |
|
126 | ! |
if (!is.data.frame(dict)) dict <- as.data.frame(as.matrix(dict), stringsAsFactors = FALSE) |
127 | 12x |
terms <- if (name.map[["term"]] %in% colnames(dict)) colnames(dict) != name.map[["term"]] else !logical(ncol(dict)) |
128 | 12x |
if (missing(pattern.weights) && !any(pattern.weights %in% colnames(dict))) { |
129 | 6x |
if (any(su <- terms & vapply(dict, is.numeric, TRUE))) { |
130 | 6x |
terms <- terms & !su |
131 | 6x |
pattern.weights <- dict[, su] |
132 |
} |
|
133 |
} |
|
134 | 12x |
if (missing(pattern.categories) && !pattern.categories %in% colnames(dict)) { |
135 | 6x |
if (any(su <- terms & vapply(dict, function(v) !is.numeric(v) && anyDuplicated(v), TRUE))) { |
136 | 1x |
terms <- terms & !su |
137 | 1x |
pattern.categories <- dict[, su] |
138 | ! |
if (sum(su) > 1) pattern.categories <- do.call(paste, pattern.categories) |
139 |
} |
|
140 |
} |
|
141 | 12x |
if (name.map[["term"]] %in% colnames(dict)) { |
142 | 10x |
dict[, name.map[["term"]]] |
143 | 2x |
} else if (!all(terms)) { |
144 | 1x |
dict <- if (any(terms)) dict[, which(terms)[1]] else rownames(dict) |
145 |
} |
|
146 |
} |
|
147 |
} |
|
148 |
# independently entered wide weights |
|
149 | 71x |
if ((is.null(dict) || is.null(dim(dict))) && (!is.null(ncol(pattern.weights)) || !is.null(ncol(pattern.categories)))) { |
150 | 6x |
weights <- if (!is.null(ncol(pattern.weights))) pattern.weights else pattern.categories |
151 | 6x |
if (!is.null(rownames(weights)) && any(grepl("[^0-9]", rownames(weights)))) { |
152 | 1x |
dict <- rownames(weights) |
153 | 5x |
} else if (is.list(dict) && (length(dict) == 1 || |
154 | 5x |
(length(dict[[1]]) == nrow(weights) && all(vapply(dict, length, 0) == nrow(weights))))) { |
155 | 1x |
dict <- dict[[1]] |
156 |
} |
|
157 | ! |
if (length(dict) != nrow(weights)) stop("dict and wide weights do not align") |
158 | 6x |
wide <- TRUE |
159 | 6x |
if (!missing(pattern.categories) && is.character(pattern.categories) && any(su <- pattern.categories %in% weights)) { |
160 | ! |
weights <- weights[, pattern.categories[su], drop = FALSE] |
161 |
} |
|
162 | 6x |
weights <- weights[, vapply(seq_len(ncol(weights)), function(col) is.numeric(weights[, col]), TRUE), drop = FALSE] |
163 | ! |
if (!ncol(weights)) stop("could not identify numeric weights in wide weights") |
164 | 6x |
lex <- list(terms = dict, weights = weights, category = colnames(weights)) |
165 |
# wide weights in dict |
|
166 | 65x |
} else if (!is.null(dim(dict)) && ( |
167 | 65x |
(length(pattern.weights) > 1 && is.character(pattern.weights)) || |
168 | 65x |
(length(pattern.categories) > 1 && |
169 | 65x |
(length(pattern.categories) != nrow(dict) || all(pattern.categories %in% colnames(dict)))) || |
170 | 65x |
(!any(pattern.weights %in% colnames(dict)) && !any(pattern.categories %in% colnames(dict))) |
171 |
)) { |
|
172 | 8x |
if (any(su <- pattern.weights %in% colnames(dict))) { |
173 | 1x |
categories <- pattern.weights[su] |
174 | 7x |
} else if (any(su <- pattern.categories %in% colnames(dict))) { |
175 | 1x |
categories <- pattern.categories |
176 | 6x |
} else if (any(su <- vapply(colnames(dict), function(v) is.numeric(dict[, v]), TRUE))) { |
177 | 6x |
categories <- colnames(dict)[su] |
178 |
} else { |
|
179 | ! |
stop("could not find weights in dict column names") |
180 |
} |
|
181 | 8x |
wide <- TRUE |
182 | 8x |
if (!name.map[["term"]] %in% colnames(dict)) { |
183 | 1x |
terms <- colnames(dict)[vapply(colnames(dict), function(v) !is.numeric(dict[, v]), TRUE)] |
184 | ! |
if (!length(terms)) stop("could not find terms in dict") |
185 | 1x |
name.map[["term"]] <- if (length(terms) > 1) { |
186 | ! |
su <- vapply(terms, function(v) !anyDuplicated(dict[, v]), TRUE) |
187 | ! |
if (any(su)) terms[which(su)[1]] else terms[1] |
188 |
} else { |
|
189 | 1x |
terms |
190 |
} |
|
191 |
} |
|
192 | 8x |
lex <- list(term = dict[, name.map[["term"]]], weights = dict[, categories, drop = FALSE], category = categories) |
193 |
# independently entered weights and categories |
|
194 | 57x |
} else if (is.null(dim(dict))) { |
195 | 53x |
if (is.null(dict) || (is.numeric(dict) && is.null(names(dict))) || (is.list(dict) && is.numeric(dict[[1]]) && |
196 | 53x |
is.null(names(dict[[1]])))) { |
197 | ! |
stop("could not recognize terms in dict") |
198 |
} |
|
199 | 53x |
n <- length(dict) |
200 | 53x |
lex <- data.frame( |
201 | 53x |
term = if (is.character(dict)) { |
202 | 13x |
dict |
203 | 53x |
} else if (is.numeric(dict)) { |
204 | ! |
names(dict) |
205 | 53x |
} else if (is.list(dict) && |
206 | 53x |
is.numeric(dict[[1]])) { |
207 | 4x |
unlist(lapply(dict, names), use.names = FALSE) |
208 |
} else { |
|
209 | 36x |
unlist(dict, use.names = FALSE) |
210 |
}, |
|
211 | 53x |
category = if (length(pattern.categories) == n) { |
212 | 23x |
if (is.list(dict) && !is.null(names(dict))) { |
213 | 3x |
names(dict) |
214 |
} else { |
|
215 | 20x |
pattern.categories |
216 |
} |
|
217 | 53x |
} else if (is.list(dict)) { |
218 | 21x |
rep(if (!is.null(names(dict))) { |
219 | 21x |
names(dict) |
220 |
} else { |
|
221 | ! |
paste0("cat", seq_along(dict)) |
222 | 21x |
}, vapply(dict, length, 0)) |
223 |
} else { |
|
224 | 9x |
"cat1" |
225 |
}, |
|
226 | 53x |
weights = if (is.numeric(dict)) { |
227 | ! |
unname(dict) |
228 | 53x |
} else if (is.numeric(pattern.weights)) { |
229 | 2x |
if (!is.null(names(pattern.weights)) && is.character(dict) && all(dict %in% names(pattern.weights))) { |
230 | ! |
pattern.weights[dict] |
231 |
} else { |
|
232 | 2x |
pattern.weights |
233 |
} |
|
234 | 53x |
} else if (is.list(dict)) { |
235 | 40x |
if (is.numeric(dict[[1]])) { |
236 | 4x |
unlist(dict, use.names = FALSE) |
237 | 36x |
} else if (is.list(pattern.weights) && is.numeric(pattern.weights[[1]])) { |
238 | 2x |
unlist(pattern.weights, use.names = FALSE) |
239 |
} else { |
|
240 | 34x |
1 |
241 |
} |
|
242 |
} else { |
|
243 | 11x |
1 |
244 | 53x |
}, stringsAsFactors = FALSE |
245 |
) |
|
246 |
} else { |
|
247 | 4x |
term <- if ("term" %in% names(name.map)) name.map[["term"]] else "term" |
248 | 4x |
en <- colnames(dict) |
249 | 4x |
if (!term %in% en) { |
250 | 1x |
su <- vapply(en, function(v) !is.numeric(dict[, v]), TRUE) |
251 | 1x |
if (any(su)) { |
252 | 1x |
term <- en[which(su)[1]] |
253 | 1x |
if (sum(su) > 1) { |
254 | 1x |
su <- su & vapply(en, function(v) !anyDuplicated(dict[, v]), TRUE) |
255 | ! |
if (any(su)) term <- en[which(su)[1]] |
256 |
} |
|
257 |
} else { |
|
258 | ! |
stop("could not recognize terms in dict") |
259 |
} |
|
260 |
} |
|
261 | 4x |
lex <- data.frame( |
262 | 4x |
term = dict[[term]], |
263 | 4x |
category = if (length(pattern.categories) == nrow(dict)) { |
264 | ! |
pattern.categories |
265 | 4x |
} else if (pattern.categories %in% en) dict[[pattern.categories]] else "cat1", |
266 | 4x |
weights = if (length(pattern.weights) == nrow(dict)) { |
267 | ! |
pattern.weights |
268 | 4x |
} else if (all(pattern.weights %in% en)) dict[[pattern.weights]] else 1, stringsAsFactors = FALSE |
269 |
) |
|
270 |
} |
|
271 | 2x |
if (any(lex$category == "")) lex[lex$category == "", "category"] <- "cat_unnamed" |
272 | ! |
if (is.factor(lex$term)) lex$term <- as.character(lex$term) |
273 | 71x |
if (globtoregex || !fixed) { |
274 | 17x |
lex$term <- to_regex(list(lex$term), TRUE, globtoregex)[[1]] |
275 | 4x |
if (missing(fixed)) fixed <- FALSE |
276 |
} |
|
277 | 71x |
if (wide && return.dtm) { |
278 | 1x |
wide <- FALSE |
279 | 1x |
lex <- data.frame(term = lex$term, category = if (length(lex$category) == 1) lex$category else "all") |
280 |
} |
|
281 | 71x |
if (!return.dtm && is.null(bias)) { |
282 | ! |
if (!"intname" %in% names(name.map)) name.map[["intname"]] <- "_intercept" |
283 | 51x |
if (any(su <- lex$term == name.map[["intname"]])) { |
284 | 6x |
if (wide) { |
285 | 1x |
bias <- structure(lex$weights[su, ], names = lex$categories[su]) |
286 | 1x |
lex$term <- lex$term[!su] |
287 | 1x |
lex$weights <- lex$weights[!su, , drop = FALSE] |
288 |
} else { |
|
289 | 5x |
bias <- structure(lex[su, "weights"], names = lex[su, "category"]) |
290 | 5x |
lex <- lex[!su, ] |
291 |
} |
|
292 |
} |
|
293 |
} |
|
294 | 71x |
if (exclusive) { |
295 | 62x |
cls <- tryCatch(-nchar(lex$term), error = function(e) NULL) |
296 | 62x |
if (is.null(cls)) { |
297 | ! |
warning( |
298 | ! |
"dict appears to be misencoded, so results may not be as expected;\n", |
299 | ! |
'might try reading the dictionary in with encoding = "latin1"' |
300 |
) |
|
301 | ! |
lex$term <- iconv(lex$term, sub = "#") |
302 | ! |
cls <- -nchar(lex$term) |
303 |
} |
|
304 | 62x |
if (wide) { |
305 | 13x |
o <- order(cls) |
306 | 13x |
lex$term <- lex$term[o] |
307 | 13x |
lex$weights <- lex$weights[o, ] |
308 |
} else { |
|
309 | 49x |
lex <- lex[order(cls), ] |
310 |
} |
|
311 |
} |
|
312 | 71x |
lex$category <- factor(lex$category, unique(lex$category)) |
313 | 71x |
categories <- levels(lex$category) |
314 | 71x |
if (length(bias)) { |
315 | 2x |
if (is.null(names(bias)) && length(bias) == length(categories)) names(bias) <- categories |
316 | ! |
if (any(su <- !categories %in% names(bias))) bias[categories[su]] <- 0 |
317 |
} else { |
|
318 | 61x |
bias <- structure(integer(length(categories)), names = categories) |
319 |
} |
|
320 | 71x |
bias <- bias[categories] |
321 | 8x |
if (is.logical(boundary) && boundary) boundary <- " " |
322 | 71x |
if (missing(to.lower)) { |
323 | 67x |
if (any(grepl("[A-Z]", lex$term))) { |
324 | 2x |
to.lower <- FALSE |
325 | 1x |
if (!any(grepl("[a-z]", lex$term))) text <- toupper(text) |
326 |
} |
|
327 |
} |
|
328 | 65x |
if (to.lower) text <- tolower(text) |
329 | 71x |
st <- proc.time()[[3]] |
330 | 71x |
terms <- unique(lex$term) |
331 | 71x |
if (!fixed) { |
332 | 17x |
ck <- tryCatch( |
333 | 17x |
suppressWarnings(grepl(paste0("(?:", paste(terms, collapse = "|"), ")"), "", perl = TRUE)), |
334 | 17x |
error = function(e) NULL |
335 |
) |
|
336 | 1x |
if (is.null(ck)) stop("terms contain invalid regular expressions", call. = FALSE) |
337 |
} |
|
338 | 70x |
op <- pattern_search( |
339 | 70x |
text, if (is.character(boundary)) paste0(boundary, terms, boundary) else terms, |
340 | 70x |
seq_along(terms) - 1L, fixed, exclusive |
341 |
) |
|
342 | 70x |
colnames(op[[1]]) <- terms |
343 | 70x |
if (return.dtm) { |
344 | 16x |
attr(op[[1]], "categories") <- lapply(categories, function(cat) { |
345 | 63x |
which(colnames(op[[1]]) %in% lex[lex$category == cat, "term"]) |
346 |
}) |
|
347 | 16x |
names(attr(op[[1]], "categories")) <- categories |
348 |
} else { |
|
349 | 54x |
op[[1]] <- vapply(categories, function(cat) { |
350 | 137x |
l <- if (wide) { |
351 | 26x |
data.frame(term = lex$term, weights = if (cat %in% colnames(lex$weights)) { |
352 | 26x |
lex$weights[, cat] |
353 |
} else { |
|
354 | ! |
lex$weights |
355 | 26x |
}, stringsAsFactors = FALSE) |
356 |
} else { |
|
357 | 111x |
lex[lex$category == cat, ] |
358 |
} |
|
359 | 137x |
as.numeric(op[[1]][, l$term, drop = FALSE] %*% l$weights + bias[[cat]]) |
360 | 54x |
}, numeric(length(text))) |
361 | 54x |
if (length(text) == 1) { |
362 | 9x |
op[[1]] <- t(op[[1]]) |
363 | 9x |
rownames(op[[1]]) <- 1 |
364 |
} |
|
365 |
} |
|
366 | ! |
if (length(text_names) == nrow(op[[1]])) rownames(op[[1]]) <- text_names |
367 | 70x |
attr(op[[1]], "WC") <- op[[2]] |
368 | 70x |
attr(op[[1]], "time") <- c(patcat = proc.time()[[3]] - st) |
369 | 1x |
if (drop.zeros) op[[1]] <- op[[1]][, colSums(op[[1]]) != 0, drop = FALSE] |
370 | 70x |
op[[1]] |
371 |
} |
1 |
.onLoad <- function(lib, pkg) { |
|
2 | ! |
if (is.null(getOption("lingmatch.lspace.dir"))) options(lingmatch.lspace.dir = "") |
3 | ! |
if (is.null(getOption("lingmatch.dict.dir"))) options(lingmatch.dict.dir = "") |
4 |
} |
|
5 | ||
6 |
match_metric <- function(x) { |
|
7 | 559x |
mets <- c("jaccard", "euclidean", "canberra", "cosine", "pearson") |
8 | 559x |
sel <- if (is.null(x) || (length(x) == 1 && grepl(tolower(substr(x, 1, 1)), "a", fixed = TRUE))) { |
9 | 21x |
mets |
10 | 559x |
} else if (is.function(x)) { |
11 | ! |
stop("only internal metrics are available: ", paste(mets, collapse = ", "), call. = FALSE) |
12 |
} else { |
|
13 | 538x |
if (is.numeric(x)) { |
14 | 1x |
mets[x] |
15 |
} else { |
|
16 | ! |
if (is.call(x)) x <- eval(x) |
17 | 537x |
su <- grepl("^(?:cor|r)", x, TRUE) |
18 | 2x |
if (any(su)) x[su] <- "pearson" |
19 | 537x |
unique(unlist(lapply(substr(x, 1, 3), grep, mets, fixed = TRUE, value = TRUE))) |
20 |
} |
|
21 |
} |
|
22 | 559x |
list(all = mets, selected = sel, dummy = as.integer(mets %in% sel)) |
23 |
} |
|
24 | ||
25 |
to_regex <- function(dict, intext = FALSE, isGlob = TRUE) { |
|
26 | 31x |
lapply(dict, function(l) { |
27 | 34x |
l <- gsub("([+*])[+*]+", "\\\\\\1+", sub("(?<=[^\\\\])\\\\$", "\\\\\\\\", l, perl = TRUE)) |
28 | 34x |
if (isGlob) { |
29 | 24x |
if (any(ck <- grepl("(?:^|\\s+)\\*|(?<=\\w)\\*(?:$|\\s+)", l, perl = TRUE))) { |
30 | 18x |
l[ck] <- gsub("\\*", "[^\\\\s]*", gsub("([.^$?(){}[-]|\\])", "\\\\\\1", l[ck], perl = TRUE)) |
31 |
} |
|
32 |
} |
|
33 | 34x |
if (any(ck <- grepl("[[({]", l) + grepl("[})]|\\]", l) == 1)) { |
34 | 7x |
l[ck] <- gsub("([([{}\\])])", "\\\\\\1", l[ck], perl = TRUE) |
35 |
} |
|
36 | 34x |
l |
37 |
}) |
|
38 |
} |
|
39 | ||
40 |
download.resource <- function( |
|
41 |
type, resource, decompress = TRUE, |
|
42 |
check.md5 = TRUE, mode = "wb", dir = "", overwrite = FALSE) { |
|
43 | 4x |
if (dir == "") { |
44 | ! |
stop(paste0( |
45 | ! |
"specify a directory (dir), or set the ", type, |
46 | ! |
" directory option\n(e.g., options(lingmatch.", type, ".dir = ", |
47 | ! |
'"~/', if (type == "dict") "Dictionaries" else "Latent Semantic Space", |
48 | ! |
'"))\nor initialize it with lma_initdirs()' |
49 | ! |
), call. = FALSE) |
50 |
} |
|
51 | 4x |
all_resources <- rownames(if (type == "dict") dict_info else lss_info) |
52 | ! |
if (length(resource) == 1 && resource == "all") resource <- all_resources |
53 | 4x |
if (length(resource) > 1) { |
54 | 1x |
return(lapply(structure(resource, names = resource), function(d) { |
55 | 2x |
tryCatch( |
56 | 2x |
download.resource( |
57 | 2x |
type = type, resource = d, decompress = decompress, |
58 | 2x |
check.md5 = check.md5, mode = mode, dir = dir |
59 |
), |
|
60 | 2x |
error = function(e) e$message |
61 |
) |
|
62 |
})) |
|
63 |
} |
|
64 | 3x |
dir <- normalizePath(dir, "/", FALSE) |
65 | 1x |
if (resource == "default") resource <- if (type == "dict") "lusi" else "100k_lsa" |
66 | 3x |
name <- grep(paste0("^", sub("\\..*$", "", resource)), all_resources, value = TRUE) |
67 | 3x |
if (!length(name)) { |
68 | 1x |
name <- grep( |
69 | 1x |
paste0("^", substr(resource, 1, 4)), all_resources, TRUE, |
70 | 1x |
value = TRUE |
71 |
) |
|
72 |
} |
|
73 | 3x |
if (!length(name)) { |
74 | ! |
stop( |
75 | ! |
type, " ", resource, " not recognized; see https://osf.io/", |
76 | ! |
if (type == "dict") "y6g5b" else "489he", "/wiki for available resources" |
77 |
) |
|
78 |
} else { |
|
79 | 3x |
name <- name[1] |
80 |
} |
|
81 | 3x |
urls <- list( |
82 | 3x |
info = function(id) paste0("https://api.osf.io/v2/files/", id), |
83 | 3x |
dl = function(id) paste0("https://osf.io/download/", id), |
84 | 3x |
versions = function(id) paste0("https://osf.io/", id, "/?show=revision") |
85 |
) |
|
86 | ! |
if (!dir.exists(dir)) dir.create(dir, recursive = TRUE) |
87 | 3x |
dl <- function(id, ext) { |
88 | 5x |
s <- urls$dl(id) |
89 | 5x |
o <- unique(normalizePath(paste0( |
90 | 5x |
dir, "/", name, c(ext, sub(".bz2", "", ext, fixed = TRUE)) |
91 | 5x |
), "/", FALSE)) |
92 | 5x |
if (any(file.exists(o))) { |
93 | ! |
if (overwrite) { |
94 | ! |
unlink(o) |
95 |
} else { |
|
96 | ! |
return(-1) |
97 |
} |
|
98 |
} |
|
99 | 5x |
status <- tryCatch(download.file(s, o[[1]], mode = mode), error = function(e) 1) |
100 | 5x |
if (!status && check.md5) { |
101 | 5x |
fi <- strsplit(readLines(urls$info(id), 1, TRUE, FALSE, "utf-8"), '[:,{}"]+')[[1]] |
102 | 5x |
ck <- md5sum(o[[1]]) |
103 | 5x |
if (fi[which(fi == "md5") + 1] != ck) { |
104 | ! |
warning(paste0( |
105 | ! |
"MD5 (", ck, ") does not seem to match the one on record;\n", |
106 | ! |
"double check and try manually downloading at ", urls$versions(id) |
107 |
)) |
|
108 |
} |
|
109 |
} |
|
110 | ! |
if (status) warning("failed to download file from ", s, call. = FALSE) |
111 | 5x |
status |
112 |
} |
|
113 | 3x |
if (type == "lspace") { |
114 | 2x |
status <- dl(lss_info[name, "osf_terms"], "_terms.txt") |
115 | 2x |
if (status < 1) status <- dl(lss_info[name, "osf_dat"], ".dat.bz2") |
116 | 2x |
if (status < 1 && decompress) { |
117 | 2x |
if (Sys.which("bunzip2") == "") { |
118 | ! |
warning("could not find path to bunzip2 command for decompression") |
119 |
} else { |
|
120 | 2x |
o <- normalizePath(paste0(dir, "/", name, ".dat.bz2"), "/", FALSE) |
121 | 2x |
if (file.exists(o)) { |
122 | 2x |
status <- tryCatch(system2("bunzip2", shQuote(path.expand(o))), error = function(e) 1) |
123 | 2x |
if (status) { |
124 | ! |
warning( |
125 | ! |
'failed to decompress; might try this from a system console:\n bunzip2 "', path.expand(o), '"' |
126 |
) |
|
127 |
} |
|
128 |
} |
|
129 |
} |
|
130 |
} |
|
131 | 2x |
paths <- normalizePath(paste0( |
132 | 2x |
dir, "/", name, c(".dat", if (!decompress) ".bz2", "_terms.txt") |
133 | 2x |
), "/", FALSE) |
134 |
} else { |
|
135 | 1x |
ext <- if (dict_info[name, "weighted"]) ".csv" else ".dic" |
136 | 1x |
status <- dl(dict_info[name, "osf"], ext) |
137 | 1x |
paths <- normalizePath(paste0(dir, "/", name, ext), "/", FALSE) |
138 |
} |
|
139 | 3x |
if (status < 1) { |
140 | 3x |
message( |
141 | 3x |
paste0(name, " ", type, " ", if (!status) "downloaded" else "exists", ":\n "), |
142 | 3x |
paste(paths, collapse = "\n ") |
143 |
) |
|
144 |
} |
|
145 | 3x |
invisible(paths) |
146 |
} |
1 |
#' Linguistic Matching and Accommodation |
|
2 |
#' |
|
3 |
#' Offers a variety of methods to assess linguistic matching or accommodation, where \emph{matching} |
|
4 |
#' is general similarity (sometimes called \emph{homophily}), and \emph{accommodation} is some form |
|
5 |
#' of conditional similarity (accounting for some base-rate or precedent; sometimes called |
|
6 |
#' \emph{alignment}). |
|
7 |
#' |
|
8 |
#' There are a great many points of decision in the assessment of linguistic similarity and/or |
|
9 |
#' accommodation, partly inherited from the great many point of decision inherent in the numerical |
|
10 |
#' representation of language. Two general types of matching are implemented here as sets of |
|
11 |
#' defaults: Language/Linguistic Style Matching (LSM; Niederhoffer & Pennebaker, 2002; Ireland & |
|
12 |
#' Pennebaker, 2010), and Latent Semantic Analysis/Similarity (LSA; Landauer & Dumais, 1997; |
|
13 |
#' Babcock, Ta, & Ickes, 2014). See the \code{type} argument for specifics. |
|
14 |
#' |
|
15 |
#' @param input Texts to be compared; a vector, document-term matrix (dtm; with terms as column names), |
|
16 |
#' or path to a file (.txt or .csv, with texts separated by one or more lines/rows). |
|
17 |
#' @param comp Defines the comparison to be made: |
|
18 |
#' \itemize{ |
|
19 |
#' \item If a \strong{function}, this will be applied to \code{input} within each group (overall if there is |
|
20 |
#' no group; i.e., \code{apply(input, 2, comp)}; e.g., \code{comp = mean} would compare each text to |
|
21 |
#' the mean profile of its group). |
|
22 |
#' \item If a \strong{character} with a length of 1 and no spaces: |
|
23 |
#' \itemize{ |
|
24 |
#' \item If it partially matches one of \code{lsm_profiles}'s rownames, that row will be used as the comparison. |
|
25 |
#' \item If it partially matches \code{'auto'}, the highest correlating \code{lsm_profiles} row will be used. |
|
26 |
#' \item If it partially matches \code{'pairwise'}, each text will be compared to one another. |
|
27 |
#' \item If it partially matches \code{'sequential'}, the last variable in \code{group} will be treated as |
|
28 |
#' a speaker ID (see the Grouping and Comparisons section). |
|
29 |
#' } |
|
30 |
#' \item If a \strong{character vector}, this will be processed in the same way as \code{input}. |
|
31 |
#' \item If a \strong{vector}, either (a) logical or factor-like (having n levels < length) and of the same length as |
|
32 |
#' \code{nrow(input)}, or (b) numeric or logical of length less than \code{nrow(input)}, this will be used to |
|
33 |
#' select a subset of \code{input} (e.g., \code{1:10} would treat the first 10 rows of \code{input} as the |
|
34 |
#' comparison; \code{lingmatch(text, type == 'prompt', data)} would use the texts in the \code{text} column |
|
35 |
#' identified by the \code{type} column as the comparison). |
|
36 |
#' \item If a \strong{matrix-like object} (having multiple rows and columns), or a named vector, this will |
|
37 |
#' be treated as a sort of dtm, assuming there are common (column) names between \code{input} and |
|
38 |
#' \code{comp} (e.g., if you had prompt and response texts that were already processed separately). |
|
39 |
#' } |
|
40 |
#' @param data A matrix-like object as a reference for column names, if variables are referred to in |
|
41 |
#' other arguments (e.g., \code{lingmatch(text, data = data)} would be the same as |
|
42 |
#' \code{lingmatch(data$text)}. |
|
43 |
#' @param group A logical or factor-like vector the same length as \code{NROW(input)}, used to defined |
|
44 |
#' groups. |
|
45 |
#' @param ... Passes arguments to \code{\link{lma_dtm}}, \code{\link{lma_weight}}, |
|
46 |
#' \code{\link{lma_termcat}}, and/or \code{\link{lma_lspace}} (depending on \code{input} and \code{comp}), |
|
47 |
#' and \code{\link{lma_simets}}. |
|
48 |
#' @param comp.data A matrix-like object as a source for \code{comp} variables. |
|
49 |
#' @param comp.group The column name of the grouping variable(s) in \code{comp.data}; if |
|
50 |
#' \code{group} contains references to column names, and \code{comp.group} is not specified, |
|
51 |
#' \code{group} variables will be looked for in \code{comp.data}. |
|
52 |
#' @param order A numeric vector the same length as \code{nrow(input)} indicating the order of the |
|
53 |
#' texts and grouping variables when the type of comparison is sequential. Only necessary if the |
|
54 |
#' texts are not already ordered as desired. |
|
55 |
#' @param drop logical; if \code{TRUE}, will drop columns with a sum of 0. |
|
56 |
#' @param all.levels logical; if \code{FALSE}, multiple groups are combined. See the Grouping and |
|
57 |
#' Comparisons section. |
|
58 |
#' @param type A character at least partially matching 'lsm' or 'lsa'; applies default settings |
|
59 |
#' aligning with the standard calculations of each type: |
|
60 |
#' \tabular{ll}{ |
|
61 |
#' LSM \tab \code{lingmatch(text, weight = 'freq', dict = lma_dict(1:9), metric = 'canberra')}\cr |
|
62 |
#' LSA \tab \code{lingmatch(text, weight = 'tfidf', space = '100k_lsa', metric = 'cosine')}\cr |
|
63 |
#' } |
|
64 |
#' @section Grouping and Comparisons: |
|
65 |
#' Defining groups and comparisons can sometimes be a bit complicated, and requires dataset |
|
66 |
#' specific knowledge, so it can't always (readily) be done automatically. Variables entered in the |
|
67 |
#' \code{group} argument are treated differently depending on their position and other arguments: |
|
68 |
#' |
|
69 |
#' \describe{ |
|
70 |
#' \item{Splitting}{By default, groups are treated as if they define separate chunks of data in |
|
71 |
#' which comparisons should be calculated. Functions used to calculated comparisons, and |
|
72 |
#' pairwise comparisons are performed separately in each of these groups. For example, if you |
|
73 |
#' wanted to compare each text with the mean of all texts in its condition, a \code{group} |
|
74 |
#' variable could identify and split by condition. Given multiple grouping variables, |
|
75 |
#' calculations will either be done in each split (if \code{all.levels = TRUE}; applied in |
|
76 |
#' sequence so that groups become smaller and smaller), or once after all splits are made (if |
|
77 |
#' \code{all.levels = FALSE}). This makes for 'one to many' comparisons with either calculated |
|
78 |
#' or preexisting standards (i.e., the profile of the current data, or a precalculated profile, |
|
79 |
#' respectively).} |
|
80 |
#' \item{Comparison ID}{When comparison data is identified in \code{comp}, groups are assumed |
|
81 |
#' to apply to both \code{input} and \code{comp} (either both in \code{data}, or separately |
|
82 |
#' between \code{data} and \code{comp.data}, in which case \code{comp.group} may be needed if |
|
83 |
#' the same grouping variable have different names between \code{data} and \code{comp.data}). |
|
84 |
#' In this case, multiple grouping variables are combined into a single factor assumed to |
|
85 |
#' uniquely identify a comparison. This makes for 'one to many' comparisons with specific texts |
|
86 |
#' (as in the case of manipulated prompts or text-based conditions).} |
|
87 |
#' \item{Speaker ID}{If \code{comp} matches \code{'sequential'}, the last grouping variable |
|
88 |
#' entered is assumed to identify something like speakers (i.e., a factor with two or more |
|
89 |
#' levels and multiple observations per level). In this case, the data are assumed to be ordered |
|
90 |
#' (or ordered once sorted by \code{order} if specified). Any additional grouping variables |
|
91 |
#' before the last are treated as splitting groups. This can set up for probabilistic |
|
92 |
#' accommodation metrics. At the moment, when sequential comparisons are made within groups, |
|
93 |
#' similarity scores between speakers are averaged, resulting in mean matching between speakers |
|
94 |
#' within the group.} |
|
95 |
#' } |
|
96 |
#' @references |
|
97 |
#' Babcock, M. J., Ta, V. P., & Ickes, W. (2014). Latent semantic similarity and language style |
|
98 |
#' matching in initial dyadic interactions. \emph{Journal of Language and Social Psychology, 33}, |
|
99 |
#' 78-88. |
|
100 |
#' |
|
101 |
#' Ireland, M. E., & Pennebaker, J. W. (2010). Language style matching in writing: synchrony in |
|
102 |
#' essays, correspondence, and poetry. \emph{Journal of Personality and Social Psychology, 99}, |
|
103 |
#' 549. |
|
104 |
#' |
|
105 |
#' Landauer, T. K., & Dumais, S. T. (1997). A solution to Plato's problem: The latent semantic |
|
106 |
#' analysis theory of acquisition, induction, and representation of knowledge. |
|
107 |
#' \emph{Psychological Review, 104}, 211. |
|
108 |
#' |
|
109 |
#' Niederhoffer, K. G., & Pennebaker, J. W. (2002). Linguistic style matching in social interaction. |
|
110 |
#' \emph{Journal of Language and Social Psychology, 21}, 337-360. |
|
111 |
#' @seealso For a general text processing function, see \code{\link{lma_process}()}. |
|
112 |
#' @return A list with processed components of the input, information about the comparison, and results of |
|
113 |
#' the comparison: |
|
114 |
#' \itemize{ |
|
115 |
#' \item \strong{\code{dtm}}: A sparse matrix; the raw count-dtm, or a version of the original input |
|
116 |
#' if it is more processed. |
|
117 |
#' \item \strong{\code{processed}}: A matrix-like object; a processed version of the input |
|
118 |
#' (e.g., weighted and categorized). |
|
119 |
#' \item \strong{\code{comp.type}}: A string describing the comparison if applicable. |
|
120 |
#' \item \strong{\code{comp}}: A vector or matrix-like object; the comparison data if applicable. |
|
121 |
#' \item \strong{\code{group}}: A string describing the group if applicable. |
|
122 |
#' \item \strong{\code{sim}}: Result of \code{\link{lma_simets}}. |
|
123 |
#' } |
|
124 |
#' @examples |
|
125 |
#' # compare single strings |
|
126 |
#' lingmatch("Compare this sentence.", "With this other sentence.") |
|
127 |
#' |
|
128 |
#' # compare each entry in a character vector with... |
|
129 |
#' texts <- c( |
|
130 |
#' "One bit of text as an entry...", |
|
131 |
#' "Maybe multiple sentences in an entry. Maybe essays or posts or a book.", |
|
132 |
#' "Could be lines or a column from a read-in file..." |
|
133 |
#' ) |
|
134 |
#' |
|
135 |
#' ## one another |
|
136 |
#' lingmatch(texts) |
|
137 |
#' |
|
138 |
#' ## the first |
|
139 |
#' lingmatch(texts, 1) |
|
140 |
#' |
|
141 |
#' ## the next |
|
142 |
#' lingmatch(texts, "seq") |
|
143 |
#' |
|
144 |
#' ## the set average |
|
145 |
#' lingmatch(texts, mean) |
|
146 |
#' |
|
147 |
#' ## other entries in a group |
|
148 |
#' lingmatch(texts, group = c("a", "a", "b")) |
|
149 |
#' |
|
150 |
#' ## one another, without stop words |
|
151 |
#' lingmatch(texts, exclude = "function") |
|
152 |
#' |
|
153 |
#' ## a standard average (based on function words) |
|
154 |
#' lingmatch(texts, "auto", dict = lma_dict(1:9)) |
|
155 |
#' |
|
156 |
#' @export |
|
157 |
#' @import methods Matrix |
|
158 |
#' @importFrom stats na.omit dpois ppois |
|
159 |
#' @importFrom Rcpp sourceCpp |
|
160 |
#' @importFrom RcppParallel RcppParallelLibs |
|
161 |
#' @useDynLib lingmatch, .registration = TRUE |
|
162 | ||
163 |
lingmatch <- function(input = NULL, comp = mean, data = NULL, group = NULL, ..., comp.data = NULL, comp.group = NULL, order = NULL, |
|
164 |
drop = FALSE, all.levels = FALSE, type = "lsm") { |
|
165 | 83x |
inp <- as.list(substitute(...())) |
166 |
# setting up a default type if specified |
|
167 | 83x |
if (!missing(type) && !is.null(type)) { |
168 | 39x |
type <- if (grepl("lsm|lang|ling|style|match", type, TRUE)) "lsm" else "lsa" |
169 | 39x |
ni <- names(inp) |
170 | 31x |
if (type == "lsm" && !"dict" %in% ni) inp$dict <- lma_dict(1:9) |
171 | 1x |
if (type != "lsm" && !"space" %in% ni) inp$space <- "100k_lsa" |
172 | 39x |
if (!"metric" %in% ni) inp$metric <- if (type == "lsm") "canberra" else "cosine" |
173 | 39x |
if (is.null(attr(input, "type")) || length(attr(input, "type")) == 1) { |
174 | 25x |
if (type == "lsm" && !"percent" %in% ni) inp$percent <- TRUE |
175 | 6x |
if (type != "lsm" && !"weight" %in% ni) inp$weight <- "tfidf" |
176 |
} |
|
177 |
} |
|
178 | 83x |
mets <- c("jaccard", "euclidean", "canberra", "cosine", "pearson") |
179 | 83x |
inp$metric <- if (!is.null(inp$metric)) match_metric(inp$metric)$selected else "cosine" |
180 | 1x |
if (!length(inp$metric) || all(inp$metric == "")) inp$metric <- "cosine" |
181 | 83x |
vs <- c("input", "comp", "group", "order", "data", "comp.data", "comp.group") |
182 | 83x |
opt <- as.list(match.call(expand.dots = FALSE))[vs] |
183 | 83x |
names(opt) <- vs |
184 |
# organizing options for preprocessing |
|
185 | 83x |
dsp <- lapply(c("lma_dtm", "lma_weight", "lma_lspace", "lma_termcat", "lma_simets"), function(f) { |
186 | 415x |
a <- names(as.list(args(f))) |
187 | 415x |
a <- a[-c(1, length(a))] |
188 | 415x |
inp[a[a %in% names(inp)]] |
189 |
}) |
|
190 | 83x |
names(dsp) <- c("p", "w", "m", "c", "s") |
191 |
# fetches input from data or environment |
|
192 | 83x |
parent_frame <- parent.frame(1) |
193 | 83x |
gv <- function(a, data = NULL) { |
194 | 80x |
ta <- a |
195 | 80x |
if (is.character(a)) { |
196 | ! |
if (!is.null(data) && a %in% colnames(data)) { |
197 | ! |
return(unlist(data[, a])) |
198 | ! |
} else if (length(ta) == 1 || !any(grepl(" ", a, fixed = TRUE))) ta <- parse(text = a) |
199 |
} |
|
200 | 80x |
ta <- tryCatch(eval(ta, parent_frame), error = function(e) NULL) |
201 | 80x |
if (!length(ta) || (!is.null(dim(ta)) && !dim(ta)[1])) { |
202 | 8x |
ta <- tryCatch(eval(a, data, parent_frame), error = function(e) NULL) |
203 | 8x |
if (!length(ta) || (!is.null(dim(ta)) && !dim(ta)[1])) { |
204 | 1x |
ta <- tryCatch(eval(a, globalenv()), error = function(e) NULL) |
205 | 1x |
if (is.null(ta)) { |
206 | 1x |
ta <- tryCatch(eval(a, data), error = function(e) NULL) |
207 | 1x |
if (is.null(ta)) { |
208 | 1x |
p <- 2 |
209 | 1x |
while (is.null(ta) && p < 99) { |
210 | 97x |
p <- p + 1 |
211 | 97x |
ta <- tryCatch(eval(a, parent.frame(p)), error = function(e) NULL) |
212 |
} |
|
213 |
} |
|
214 | 1x |
if (is.null(ta)) stop("could not find ", deparse(a), call. = FALSE) |
215 |
} |
|
216 |
} |
|
217 |
} |
|
218 | 79x |
ta |
219 |
} |
|
220 | 83x |
gd <- function(a, data = NULL) { |
221 | 64x |
r <- if (is.character(a) && length(a) == 1 && grepl("\\.(?:csv|txt|tsv|tab)$", a, TRUE)) { |
222 | 3x |
if (file.exists(a)) { |
223 | 3x |
r <- if (grepl("txt$", a)) { |
224 | 1x |
readLines(a, warn = FALSE) |
225 |
} else { |
|
226 | 2x |
r <- read.table(a, TRUE, if (grepl("csv$", a)) "," else "\t", '"', comment.char = "") |
227 | 2x |
r[, which(!vapply(r, is.numeric, TRUE))[1]] |
228 |
} |
|
229 | 3x |
r[r != ""] |
230 |
} else { |
|
231 | ! |
stop(a, " does not exist", call. = FALSE) |
232 |
} |
|
233 | 64x |
} else if (is.character(a)) a else gv(a, data) |
234 | 2x |
if (is.factor(r)) r <- as.character(r) |
235 | 3x |
if (is.character(r) && length(r) == 1 && grepl("\\.(?:csv|txt|tsv|tab)$", r, TRUE)) r <- gd(r) |
236 | 64x |
r |
237 |
} |
|
238 |
# weight, categorize, and/or map |
|
239 | 83x |
wmc <- function(a) { |
240 | 66x |
if (!is.null(colnames(a)) || (length(dsp$c) == 0 && length(dsp$m) == 0)) { |
241 | 23x |
if (length(dsp$w) != 0) a <- do.call(lma_weight, c(list(a), lapply(dsp$w, eval, parent_frame))) |
242 | 21x |
if (length(dsp$c) != 0) a <- do.call(lma_termcat, c(list(a), lapply(dsp$c, eval, parent_frame))) |
243 | 9x |
if (length(dsp$m) != 0) a <- do.call(lma_lspace, c(list(a), lapply(dsp$m, eval, parent_frame))) |
244 |
} |
|
245 | 66x |
a |
246 |
} |
|
247 |
# initial data parsing |
|
248 |
# input |
|
249 | 83x |
if (missing(input)) { |
250 | 1x |
if (!is.null(data)) { |
251 | 1x |
opt$input <- opt$data |
252 | 1x |
input <- data |
253 |
} else { |
|
254 | ! |
input <- file.choose() |
255 | ! |
opt$input <- input |
256 |
} |
|
257 |
} |
|
258 | 83x |
if (is.function(input) || ((is.list(input) || is.numeric(input)) && is.null(dim(input)))) { |
259 | ! |
stop( |
260 | ! |
"enter a character vector or matrix-like object as input", |
261 | ! |
call. = FALSE |
262 |
) |
|
263 |
} |
|
264 | 83x |
if (missing(data)) { |
265 | 79x |
data <- input |
266 | 12x |
if (is.character(input)) input <- gd(opt$input) |
267 |
} else { |
|
268 | 4x |
input <- if (is.character(input) && all(input %in% colnames(data))) data[, input] else gd(opt$input, data) |
269 |
} |
|
270 | 12x |
if (!missing(group) && is.data.frame(input)) input <- as.matrix(input[, vapply(input, is.numeric, TRUE)]) |
271 | 83x |
rx <- NROW(input) |
272 |
# comp |
|
273 | 83x |
comp_missing <- missing(comp) |
274 | 83x |
if (!comp_missing) { |
275 | 47x |
comp <- gd(opt$comp, if (missing(comp.data)) if (is.call(opt$comp)) NULL else data else comp.data) |
276 | ! |
if (!missing(comp.data) && is.character(comp) && all(comp %in% colnames(comp.data))) comp <- comp.data[, comp] |
277 | 2x |
if (!missing(data) && is.character(comp) && all(comp %in% colnames(data))) comp <- data[, comp] |
278 | 1x |
if (is.logical(comp)) comp <- which(comp) |
279 | 13x |
if (missing(comp.data) && !is.null(colnames(comp))) comp.data <- comp |
280 | 36x |
} else if (comp_missing && missing(group) && missing(comp.data) && missing(comp.group)) { |
281 | 27x |
opt$comp <- comp <- "pairwise" |
282 |
} else { |
|
283 | 9x |
opt$comp <- "mean" |
284 |
} |
|
285 | 22x |
if (length(opt$comp) > 1) opt$comp <- deparse(opt$comp) |
286 | 4x |
if (is.factor(input)) input <- as.character(input) |
287 | 83x |
if (is.factor(comp)) { |
288 | 2x |
comp <- as.character(comp) |
289 | 81x |
} else if (is.data.frame(comp)) { |
290 | 3x |
comp <- comp[, vapply(comp, is.numeric, TRUE)] |
291 |
} |
|
292 | 83x |
do.wmc <- TRUE |
293 | 83x |
if ("dict" %in% names(inp) && any(class(input) %in% c("matrix", "data.frame")) && |
294 | 83x |
is.null(attr(input, "Type"))) { |
295 | 17x |
cn <- colnames(input) |
296 | 17x |
dn <- gv(inp$dict) |
297 | 10x |
if (is.list(dn)) dn <- names(dn) |
298 | 17x |
if (any(!(ck <- dn %in% cn))) { |
299 | 2x |
cat_map <- structure(c(rep(colnames(lsm_profiles), 2), "article", "prep"), names = c( |
300 | 2x |
colnames(lsm_profiles), "personal_pronouns", "impersonal_pronouns", "articles", "auxiliary_verbs", |
301 | 2x |
"adverbs", "prepositions", "conjunctions", "negations", "quantifiers", "articles", "preps" |
302 |
)) |
|
303 | 2x |
cn <- sub("^liwc[ .:_-]+", "", tolower(cn)) |
304 | 2x |
tr <- cn %in% names(cat_map) |
305 | 2x |
if (any(tr)) colnames(input)[tr] <- cat_map[cn[tr]] |
306 | 2x |
ck <- dn %in% colnames(input) |
307 |
} |
|
308 | 17x |
if (sum(ck) / length(ck) > .75) { |
309 | 17x |
inp$dict <- NULL |
310 | ! |
if (any(!ck)) dn <- dn[ck] |
311 | 17x |
input <- input[, dn] |
312 | 17x |
do.wmc <- FALSE |
313 | 17x |
if (!comp_missing && any(class(comp) %in% c("matrix", "data.frame")) && all(dn %in% colnames(comp))) { |
314 | 3x |
comp <- comp[, dn] |
315 |
} |
|
316 |
} |
|
317 |
} |
|
318 | 83x |
if (!is.matrix(input) && is.character(input)) { |
319 |
# if input looks like text, seeing if other text can be added, then converting to a dtm |
|
320 | 15x |
if (is.character(comp) && (length(comp) > 1 || grepl(" ", comp, fixed = TRUE))) { |
321 | 5x |
input <- c(comp, input) |
322 | 5x |
comp <- seq_along(comp) |
323 | 5x |
opt$comp <- "text" |
324 |
} |
|
325 | 15x |
input <- do.call(lma_dtm, c(list(input), dsp$p)) |
326 |
} |
|
327 | 3x |
if (is.data.frame(comp)) comp <- as.matrix(comp) |
328 | 83x |
cc <- if (is.numeric(comp) && (!is.null(comp.data) || is.null(dim(comp)))) { |
329 | 20x |
1 |
330 | 83x |
} else if (is.character(comp)) { |
331 | 45x |
comp <- tolower(comp) |
332 | 45x |
2 |
333 |
} else { |
|
334 | 18x |
0 |
335 |
} |
|
336 |
# group and order |
|
337 | 83x |
agc <- c("c", "list", "cbind", "data.frame") |
338 | 83x |
if (missing(group) && !missing(comp.group)) { |
339 | 2x |
group <- NULL |
340 | 2x |
opt$group <- opt$comp.group |
341 |
} |
|
342 | 83x |
if (!missing(group) && !(is.null(colnames(data)) && rx == length(opt$group) - 1)) { |
343 | 20x |
group <- if (length(opt$group) > 1 && as.character(opt$group[1]) %in% agc && |
344 | 20x |
!grepl("[$[]", as.character(opt$group[1]))) { |
345 | 5x |
group <- tryCatch(gv(opt$group, data), error = function(e) NULL) |
346 | 5x |
if (is.character(group) && is.null(dim(group)) && all(group %in% colnames(data))) { |
347 | 3x |
group <- data[, group] |
348 |
} |
|
349 | 5x |
if (is.null(group)) lapply(opt$group[-1], gv, data) else group |
350 |
} else { |
|
351 | 15x |
if (!is.null(colnames(data)) && is.character(opt$group) && length(opt$group) < nrow(data)) { |
352 | 3x |
if (!all(opt$group %in% colnames(data))) { |
353 | ! |
stop("group appears to be column names, but were not found in data", call. = FALSE) |
354 |
} |
|
355 | 3x |
group <- data[, opt$group] |
356 | 3x |
if (!is.list(group)) group <- if (is.matrix(group)) as.data.frame(group, stringsAsFactors = FALSE) else list(group) |
357 |
} else { |
|
358 | 12x |
group <- gv(opt$group, data) |
359 | 11x |
if (is.factor(group)) { |
360 | 3x |
group <- as.character(group) |
361 | 8x |
} else if (is.matrix(group)) { |
362 | 1x |
group <- as.data.frame(group, row.names = FALSE, stringsAsFactors = FALSE) |
363 |
} |
|
364 | 3x |
if (is.null(dim(group))) list(group) else lapply(group, as.character) |
365 |
} |
|
366 |
} |
|
367 |
} |
|
368 | 82x |
if (!missing(comp.group) || (!is.null(comp.data) && !missing(group))) { |
369 | 3x |
cg <- opt[[if (missing(comp.group)) "group" else "comp.group"]] |
370 | 3x |
if (!is.null(cg)) { |
371 | 3x |
cg <- if (!is.null(comp.data) && length(cg) > 1 && |
372 | 3x |
as.character(cg[1]) %in% agc && !grepl("[$[]", as.character(cg[1]))) { |
373 | 1x |
cg <- tryCatch(gv(cg, comp.data), error = function(e) NULL) |
374 | 1x |
if (is.character(cg) && all(cg %in% colnames(comp.data))) cg <- comp.data[, cg] |
375 | 1x |
if (is.null(cg)) lapply(as.character(cg[-1]), gv, comp.data) else cg |
376 | 3x |
} else if (is.character(cg)) { |
377 | ! |
if (cg %in% colnames(comp.data)) { |
378 | ! |
list(comp.data[, cg]) |
379 |
} else { |
|
380 | ! |
stop("groups not found in comp.data", call. = FALSE) |
381 |
} |
|
382 |
} else { |
|
383 | 2x |
list(gv(cg, comp.data)) |
384 |
} |
|
385 | 3x |
if (is.list(cg) && length(cg) == 1 && !is.null(dim(cg[[1]]))) { |
386 | ! |
cg <- as.data.frame(cg[[1]], stringsAsFactors = FALSE) |
387 | 3x |
} else if (is.character(cg) && !missing(comp.group) && all(cg %in% colnames(comp.data))) { |
388 | ! |
cg <- comp.data[, cg] |
389 |
} |
|
390 | 3x |
if (!missing(comp.group) || length(if (is.list(cg)) cg[[1]] else cg) == nrow(comp.data)) { |
391 | 3x |
if (all.levels) { |
392 | 1x |
comp.group <- cg |
393 |
} else { |
|
394 | 2x |
comp.group <- do.call(paste, cg) |
395 | 2x |
if (length(group) > 1) { |
396 | ! |
group <- do.call(paste, group) |
397 | ! |
if (!is.null(comp.data) && any(ck <- !(ckg <- unique(group)) %in% unique(comp.group))) { |
398 | ! |
if (all(ck)) { |
399 | ! |
stop("group and comp.group had no levels in common", call. = FALSE) |
400 |
} else { |
|
401 | ! |
warning("levels not found in comp.group: ", paste(ckg[ck], collapse = ", "), call. = FALSE) |
402 | ! |
group <- group[ck <- group %in% ckg[!ck]] |
403 | ! |
input <- input[ck, , drop = FALSE] |
404 |
} |
|
405 |
} |
|
406 |
} |
|
407 |
} |
|
408 |
} |
|
409 |
} |
|
410 |
} |
|
411 | 82x |
if (!missing(group)) { |
412 | 2x |
if (is.matrix(group)) group <- as.data.frame(group) |
413 | 19x |
if (length(if (is.list(group)) group[[1]] else group) != rx) { |
414 | ! |
stop("length(group) != nrow(input)", call. = FALSE) |
415 |
} |
|
416 |
} |
|
417 | 82x |
if (!missing(order)) { |
418 | 1x |
order <- gv(opt$order, data) |
419 | 1x |
if (!is.null(order)) { |
420 | 1x |
if (length(order) == rx) { |
421 | 1x |
input <- input[order, ] |
422 | 1x |
group <- lapply(group, "[", order) |
423 |
} else { |
|
424 | ! |
warning("length(order) != nrow(input), so order was not applied", call. = FALSE) |
425 |
} |
|
426 |
} else { |
|
427 | ! |
warning("failed to apply order", call. = FALSE) |
428 |
} |
|
429 |
} |
|
430 | 1x |
if (is.character(input)) input <- matrix(as.numeric(input), rx) |
431 | 82x |
if (is.data.frame(input) && any(ckvc <- !vapply(input, is.numeric, TRUE))) { |
432 | 1x |
if (all(ckvc)) { |
433 | ! |
for (col in seq_along(ckvc)) input[, col] <- as.numeric(input[, col]) |
434 |
} else { |
|
435 | 1x |
input <- input[, !ckvc] |
436 | 1x |
warning("some input variables were not numeric, so they were removed") |
437 |
} |
|
438 |
} |
|
439 | 82x |
dtm <- Matrix(if (is.data.frame(input)) as.matrix(input) else input, sparse = TRUE) |
440 | 66x |
if (do.wmc) input <- wmc(input) |
441 | 82x |
if (is.null(dim(input))) { |
442 | ! |
input <- as.matrix(input) |
443 |
} |
|
444 | 82x |
if (cc == 2 && (length(comp) > 1 || any(grepl(" ", comp, fixed = TRUE)))) { |
445 | ! |
comp <- do.call(lma_dtm, c(list(comp), dsp$p)) |
446 | ! |
cc <- 1 |
447 |
} |
|
448 |
# if comp appears to be a dtm, unifying input and comp |
|
449 | 3x |
if (cc == 1 && !is.null(names(comp))) comp <- t(as.matrix(comp)) |
450 | 82x |
cr <- nrow(comp) |
451 | 82x |
cn <- colnames(comp) |
452 | 82x |
if (!is.null(cn)) { |
453 | 16x |
cc <- 0 |
454 | 16x |
comp.data <- comp |
455 |
} |
|
456 | 82x |
if (drop) { |
457 | 1x |
if (sum(su <- colSums(input, na.rm = TRUE) != 0) != 0) { |
458 | 1x |
input <- input[, su, drop = FALSE] |
459 |
} else { |
|
460 | ! |
stop("input is all 0s after processing", call. = FALSE) |
461 |
} |
|
462 |
} |
|
463 | 82x |
nc <- ncol(input) |
464 |
# finalizing comp |
|
465 | 82x |
if (is.numeric(comp) && (cc == 1 || opt$comp == "text")) { |
466 | 8x |
comp.data <- input[comp, , drop = FALSE] |
467 | 8x |
if (!missing(comp.group) && !all.levels) { |
468 | ! |
if (!anyDuplicated(comp.group) && nrow(comp.data) == length(comp.group)) { |
469 | ! |
rownames(comp.data) <- comp.group |
470 |
} |
|
471 | 8x |
} else if (nrow(comp.data) == 1) { |
472 | ! |
comp.data <- structure(as.numeric(comp.data[1, ]), |
473 | ! |
names = colnames(comp.data) |
474 |
) |
|
475 |
} |
|
476 | 8x |
input <- input[-comp, , drop = FALSE] |
477 | 74x |
} else if (cc == 2) { |
478 | 45x |
ckp <- FALSE |
479 | 45x |
if (grepl("^pa|^se", comp)) { |
480 | 41x |
opt$comp <- if (grepl("^pa", comp)) "pairwise" else "sequential" |
481 | 4x |
} else if (any(!is.na(p <- pmatch(comp, rownames(lsm_profiles))))) { |
482 | 2x |
opt$comp <- rownames(lsm_profiles)[p] |
483 | 2x |
ckp <- TRUE |
484 | 2x |
comp.data <- lsm_profiles[p, , drop = FALSE] |
485 | 2x |
} else if (grepl("^au", comp)) { |
486 | 1x |
p <- colMeans(input, na.rm = TRUE) |
487 | 1x |
p <- which.max(lma_simets(lsm_profiles, p, "pearson")) |
488 | 1x |
opt$comp <- paste("auto:", names(p)) |
489 | 1x |
ckp <- TRUE |
490 | 1x |
comp.data <- lsm_profiles[p, , drop = FALSE] |
491 | 1x |
} else if (!comp_missing) { |
492 | 1x |
stop("`comp` not recognized", call. = FALSE) |
493 |
} |
|
494 | 44x |
if (ckp) { |
495 | 3x |
if (any(ckp <- !(cn <- colnames(input)) %in% (bn <- colnames(comp.data)))) { |
496 | ! |
if (all(ckp)) stop("input and comp have no columns in common", call. = FALSE) |
497 | 2x |
if ("articles" %in% cn && !"articles" %in% bn) bn[bn == "article"] <- "articles" |
498 | 2x |
if ("preps" %in% cn && !"preps" %in% bn) bn[bn == "prep"] <- "preps" |
499 | 2x |
colnames(comp.data) <- bn |
500 | 2x |
if (any(ckp <- !cn %in% bn)) { |
501 | ! |
warning("input columns were not found in comp: ", paste(cn[ckp], collapse = ", "), call. = FALSE) |
502 | ! |
comp.data <- comp.data[, cn[!ckp], drop = FALSE] |
503 |
} |
|
504 |
} else { |
|
505 | 1x |
comp.data <- comp.data[, cn, drop = FALSE] |
506 |
} |
|
507 |
} |
|
508 | 29x |
} else if (!is.null(comp.data)) { |
509 | 18x |
cn <- colnames(input) |
510 | 18x |
cns <- cn[ck <- cn %in% colnames(comp.data)] |
511 | 18x |
if (!any(ck)) { |
512 | ! |
stop("input and comp have no columns in common", call. = FALSE) |
513 | 18x |
} else if (any(!ck)) { |
514 | ! |
warning("input columns were not found in comp: ", paste(cn[!ck], collapse = ", "), call. = FALSE) |
515 | ! |
input <- input[, cns] |
516 |
} |
|
517 | 18x |
comp.data <- comp.data[, cns, drop = FALSE] |
518 |
} |
|
519 | 81x |
compmeanck <- opt$comp == "mean" |
520 | 81x |
sim <- speaker <- NULL |
521 | 81x |
if (!is.null(group)) { |
522 | 19x |
if (!is.null(comp.data) && (NROW(comp.data) == 1 || (is.list(group) && length(group[[1]]) != nrow(input)))) { |
523 | ! |
group <- NULL |
524 | ! |
warning("group does not appear to be meaningful for this comparison, so it was ignored", |
525 | ! |
call. = FALSE |
526 |
) |
|
527 | ! |
} else if (!is.list(group)) group <- list(group) |
528 | 19x |
gl <- length(group) |
529 | 19x |
if (opt$comp == "sequential") { |
530 | 7x |
speaker <- group[[gl]] |
531 | 7x |
group <- if (gl == 1) NULL else group[-gl] |
532 | 7x |
gl <- length(group) |
533 |
} |
|
534 | 19x |
if (gl > 1 && !all.levels) { |
535 | 3x |
group <- list(do.call(paste, group)) |
536 | 3x |
gl <- 1 |
537 |
} |
|
538 | 19x |
if (gl) { |
539 | 16x |
sim <- as.data.frame(group, stringsAsFactors = FALSE) |
540 | 16x |
colnames(sim) <- paste0("g", seq_len(gl)) |
541 | 16x |
for (m in inp$metric) sim[, m] <- NA |
542 | 16x |
mets <- seq_along(inp$metric) + gl |
543 |
} |
|
544 | ! |
} else if (opt$comp == "sequential" && is.null(speaker)) speaker <- seq_len(nrow(input)) |
545 |
# making comparisons |
|
546 | 81x |
sal <- dsp$s |
547 | 81x |
ck_grouppair <- !(!is.null(group) && if (is.null(comp.group)) { |
548 | 13x |
!is.null(rownames(comp.data)) |
549 |
} else { |
|
550 | 3x |
!anyDuplicated(comp.group) |
551 |
}) |
|
552 | 81x |
if (ck_grouppair && !is.logical(sal$mean)) { |
553 | 75x |
sal$mean <- isTRUE(grepl("T", sal$mean, fixed = TRUE)) |
554 |
} |
|
555 | 81x |
ckf <- is.function(comp) |
556 | 81x |
apply_comp <- function(m) { |
557 | 1x |
a <- names(as.list(args(comp))) |
558 | 1x |
if ("na.rm" %in% a) { |
559 | 1x |
apply(m, 2, comp, na.rm = TRUE) |
560 | ! |
} else if ("na.action" %in% a) { |
561 | ! |
apply(m, 2, comp, na.action = na.omit) |
562 |
} else { |
|
563 | ! |
apply(m, 2, comp) |
564 |
} |
|
565 |
} |
|
566 | 81x |
if (is.null(group)) { |
567 | 3x |
if (!is.null(speaker)) sal$group <- speaker |
568 | 65x |
if (!is.null(comp.data)) { |
569 | 26x |
if (ckf) { |
570 | 1x |
opt$comp <- paste(if (length(opt$comp.data) > 1) deparse(opt$comp.data) else opt$comp.data, opt$comp) |
571 | 1x |
sal$b <- comp.data <- if (is.null(dim(comp.data))) { |
572 | ! |
comp.data |
573 | 1x |
} else if (compmeanck) colMeans(comp.data, na.rm = TRUE) else apply_comp(comp.data) |
574 |
} else { |
|
575 | 25x |
sal$b <- comp.data |
576 |
} |
|
577 | 39x |
} else if (ckf) { |
578 | 1x |
sal$b <- comp.data <- if (compmeanck) { |
579 | ! |
colMeans(input, na.rm = TRUE) |
580 |
} else { |
|
581 | 1x |
apply_comp(input) |
582 |
} |
|
583 |
} |
|
584 | 4x |
if (!"b" %in% names(sal) && (is.numeric(comp) || !is.null(dim(comp)))) sal$b <- comp |
585 | 65x |
sim <- do.call(lma_simets, c(list(input), sal)) |
586 |
} else { |
|
587 | 16x |
gs <- as.character(unique(sim[, 1])) |
588 | 16x |
cks <- !is.null(speaker) |
589 | 16x |
ckc <- !is.null(comp.data) |
590 | 16x |
ckp <- cc == 2 && opt$comp == "pairwise" |
591 | 16x |
if (gl == 1) { |
592 | 13x |
if (opt$comp != "pairwise") { |
593 | 10x |
if (opt$comp == "sequential") { |
594 | 4x |
group <- sim[, 1] |
595 | 4x |
sim <- do.call(rbind, lapply(gs, function(g) { |
596 | 32x |
su <- which(group == g) |
597 | 32x |
s <- speaker[su] |
598 | 32x |
r <- if (length(su) < 2 || length(unique(s)) < 2) { |
599 | ! |
data.frame(group = g, structure(as.list(numeric(length(mets)) + 1), |
600 | ! |
names = inp$metric |
601 | ! |
), row.names = paste(su, collapse = ", "), stringsAsFactors = FALSE) |
602 |
} else { |
|
603 | 32x |
sal$group <- s |
604 | 32x |
r <- do.call(lma_simets, c(list(input[su, , drop = FALSE]), sal)) |
605 | 32x |
rs <- as.integer(unlist(strsplit(rownames(r), "[^0-9]+"))) |
606 | 32x |
rownames(r) <- strsplit(do.call(sprintf, c( |
607 | 32x |
paste(gsub("[0-9]+", "%i", rownames(r)), collapse = "|"), as.list(rs - 1 + su[1]) |
608 | 32x |
)), "|", fixed = TRUE)[[1]] |
609 | 32x |
data.frame(group = g, r, stringsAsFactors = FALSE) |
610 |
} |
|
611 |
})) |
|
612 |
} else { |
|
613 | 6x |
if (is.null(sal$pairwise)) sal$pairwise <- ck_grouppair |
614 | 6x |
flat <- ckf || !isTRUE(sal$pairwise) || isTRUE(sal$mean) |
615 | 6x |
sal$return.list <- !flat |
616 | 1x |
if (!flat) fsim <- list() |
617 | 6x |
ckmc <- FALSE |
618 | 6x |
if (!ckc && ckf) { |
619 | 4x |
ckmc <- TRUE |
620 | 4x |
opt$comp <- paste0(if (length(opt$group) == 1) paste(opt$group, ""), "group ", opt$comp) |
621 | 4x |
comp.data <- as.data.frame( |
622 | 4x |
matrix(NA, length(gs), nc, dimnames = list(gs, colnames(input))), |
623 | 4x |
stringsAsFactors = FALSE |
624 |
) |
|
625 |
} |
|
626 | 6x |
for (g in gs) { |
627 | 36x |
su <- sim[, 1] == g |
628 | 36x |
sal$b <- NULL |
629 | 36x |
if (ckc) { |
630 | 4x |
sal$b <- comp.data[if (!is.null(comp.group)) comp.group == g else g, , drop = FALSE] |
631 |
} else { |
|
632 | 32x |
sal$b <- input[su, , drop = FALSE] |
633 |
} |
|
634 | 36x |
if (ckf && !is.null(dim(sal$b))) { |
635 | 34x |
sal$b <- if (compmeanck) colMeans(sal$b, na.rm = TRUE) else apply_comp(sal$b) |
636 |
} |
|
637 | 32x |
if (!is.null(sal$b) && ckmc) comp.data[g, ] <- sal$b |
638 | 36x |
if (sum(su) == 1 && is.null(sal$b)) { |
639 | ! |
sim[su, mets] <- 1 |
640 | ! |
next |
641 |
} |
|
642 | 36x |
tm <- do.call(lma_simets, c(list(input[su, , drop = FALSE]), sal)) |
643 | 36x |
if (flat) { |
644 | 34x |
sim[su, mets] <- tm |
645 |
} else { |
|
646 | 2x |
fsim[[g]] <- tm |
647 |
} |
|
648 |
} |
|
649 | 1x |
if (!flat) sim <- fsim |
650 |
} |
|
651 |
} else { |
|
652 | 3x |
ug <- unique(group[[1]]) |
653 | 3x |
if (isTRUE(sal$mean)) { |
654 | 1x |
sim <- data.frame(group[[1]], NA, stringsAsFactors = FALSE) |
655 | 1x |
colnames(sim) <- c(opt$group, sal$metric) |
656 | 1x |
for (g in ug) { |
657 | 2x |
su <- group[[1]] == g |
658 | 2x |
sim[su, -1] <- if (sum(su) == 1) { |
659 | ! |
1 |
660 |
} else { |
|
661 | 2x |
do.call( |
662 | 2x |
lma_simets, c(list(input[su, , drop = FALSE]), sal) |
663 |
) |
|
664 |
} |
|
665 |
} |
|
666 |
} else { |
|
667 | 2x |
sim <- lapply(structure(ug, names = ug), function(g) { |
668 | 6x |
su <- group[[1]] == g |
669 | 6x |
if (sum(su) != 1) { |
670 | 6x |
do.call(lma_simets, c(list(input[su, , drop = FALSE]), sal)) |
671 |
} else { |
|
672 | ! |
rep(NA, length(sal$metric)) |
673 |
} |
|
674 |
}) |
|
675 |
} |
|
676 |
} |
|
677 | 3x |
} else if (gl > 1) { |
678 | 3x |
for (i in seq_len(gl - 1)) sim <- cbind(sim, sim[, mets]) |
679 | 3x |
sug <- seq_len(gl) |
680 | 3x |
cn <- paste0("g", sug) |
681 | 3x |
mn <- length(inp$metric) |
682 | 3x |
mw <- seq_len(mn) |
683 | 3x |
colnames(sim)[-sug] <- paste0(rep(vapply(seq_along(cn), function(e) { |
684 | 6x |
paste0(cn[seq_len(e)], collapse = "_") |
685 | 3x |
}, ""), each = mn), "_", inp$metric) |
686 | 3x |
group <- vapply(sug, function(g) do.call(paste, group[seq_len(g)]), character(nrow(sim))) |
687 | 3x |
if (!missing(comp.group)) { |
688 | 1x |
comp.group <- vapply(sug, function(g) { |
689 | 2x |
do.call(paste, comp.group[seq_len(g)]) |
690 | 1x |
}, character(length(comp.group[[1]]))) |
691 |
} |
|
692 | 2x |
if (is.null(sal$pairwise)) sal$pairwise <- ck_grouppair |
693 | 3x |
flat <- ckf || isTRUE(sal$mean) |
694 | 1x |
if (!flat) fsim <- list() |
695 | 3x |
ssl <- if (is.null(speaker)) TRUE else !is.na(speaker) |
696 | 3x |
for (g in unique(sim[, 1])) { |
697 | 2x |
if (!flat && is.null(fsim[[g]])) fsim[[g]] <- list() |
698 | 6x |
su <- which(sim[, 1] == g & ssl) |
699 | 6x |
sg <- group[su, , drop = FALSE] |
700 | 6x |
sx <- input[su, , drop = FALSE] |
701 | 6x |
gck <- ckc && !missing(comp.group) |
702 | 6x |
if (gck) { |
703 | 2x |
gcsub <- comp.group[, 1] == g |
704 | 2x |
if (!any(gcsub)) { |
705 | ! |
warning("the first comparison group has no levels in common with the first data group", |
706 | ! |
call. = FALSE |
707 |
) |
|
708 | ! |
gck <- FALSE |
709 |
} |
|
710 |
} |
|
711 | 6x |
for (s in sug) { |
712 | 12x |
usg <- unique(sg[, s]) |
713 | 12x |
if (length(usg) == 1) { |
714 | 6x |
ssg <- list(sx) |
715 | 6x |
names(ssg) <- usg |
716 |
} else { |
|
717 | 6x |
ssg <- lapply(usg, function(ss) sx[sg[, s] == ss, , drop = FALSE]) |
718 | 6x |
names(ssg) <- usg |
719 |
} |
|
720 | 12x |
if (length(ssg) != 0) { |
721 | 12x |
for (ssn in names(ssg)) { |
722 | 18x |
ssu <- su[sg[, s] == ssn] |
723 | 6x |
if (!flat && is.null(fsim[[g]][[ssn]])) fsim[[g]][[ssn]] <- list() |
724 | 18x |
if (cks) { |
725 | ! |
sal$group <- speaker[ssu] |
726 | 18x |
} else if (ckf && !is.null(dim(ssg[[ssn]]))) { |
727 | 12x |
sal$b <- if (compmeanck) { |
728 | 12x |
colMeans(ssg[[ssn]], na.rm = TRUE) |
729 |
} else { |
|
730 | ! |
apply_comp(ssg[[ssn]]) |
731 |
} |
|
732 |
} |
|
733 | 18x |
csu <- gl + mw + (mn * (s - 1)) |
734 | 18x |
if (gck) { |
735 | 6x |
gcsu <- comp.group[, s] == ssn & gcsub |
736 | 6x |
if (!any(gcsu)) { |
737 | ! |
warning( |
738 | ! |
"no ", paste(usg, collapse = ", "), " level found in the comparison group(s)" |
739 |
) |
|
740 |
} else { |
|
741 | 6x |
sal$b <- comp.data[gcsu, , drop = FALSE] |
742 |
} |
|
743 |
} |
|
744 | 18x |
ssim <- do.call(lma_simets, c(list(ssg[[ssn]]), sal)) |
745 | 18x |
if (flat) { |
746 | 12x |
sim[ssu, csu] <- ssim |
747 |
} else { |
|
748 | 6x |
fsim[[g]][[ssn]][[colnames(sim)[csu]]] <- ssim |
749 |
} |
|
750 |
} |
|
751 |
} |
|
752 |
} |
|
753 |
} |
|
754 | 1x |
if (!flat) sim <- fsim |
755 |
} |
|
756 |
} |
|
757 | 81x |
list( |
758 | 81x |
dtm = dtm, |
759 | 81x |
processed = input, |
760 | 81x |
comp.type = if (!is.null(opt$comp)) { |
761 | 81x |
if (is.character(opt$comp)) { |
762 | 75x |
opt$comp |
763 |
} else { |
|
764 | 6x |
gsub('"', "'", as.character(deparse(opt$comp))) |
765 |
} |
|
766 |
}, |
|
767 | 81x |
comp = comp.data, |
768 | 81x |
group = if (!is.null(opt$group)) { |
769 | 19x |
if (is.character(opt$group)) { |
770 | 3x |
opt$group |
771 |
} else { |
|
772 | 16x |
gsub('"', "'", as.character(deparse(opt$group))) |
773 |
} |
|
774 |
}, |
|
775 | 81x |
sim = sim |
776 |
) |
|
777 |
} |
1 |
#' Process Text |
|
2 |
#' |
|
3 |
#' A wrapper to other pre-processing functions, potentially from \code{\link{read.segments}}, to \code{\link{lma_dtm}} |
|
4 |
#' or \code{\link{lma_patcat}}, to \code{\link{lma_weight}}, then \code{\link{lma_termcat}} or \code{\link{lma_lspace}}, |
|
5 |
#' and optionally including \code{\link{lma_meta}} output. |
|
6 |
#' |
|
7 |
#' @param input A vector of text, or path to a text file or folder. |
|
8 |
#' @param ... arguments to be passed to \code{\link{lma_dtm}}, \code{\link{lma_patcat}}, \code{\link{lma_weight}}, |
|
9 |
#' \code{\link{lma_termcat}}, and/or \code{\link{lma_lspace}}. All arguments must be named. |
|
10 |
#' @param meta Logical; if \code{FALSE}, metastatistics are not included. Only applies when raw text is available. |
|
11 |
#' If included, meta categories are added as the last columns, with names starting with "meta_". |
|
12 |
#' @param coverage Logical; if \code{TRUE} and a dictionary is provided (\code{dict}), |
|
13 |
#' will calculate the coverage (number of unique term matches) of each dictionary category. |
|
14 |
#' @return A matrix with texts represented by rows, and features in columns, unless there are multiple rows per output |
|
15 |
#' (e.g., when a latent semantic space is applied without terms being mapped) in which case only the special output |
|
16 |
#' is returned (e.g., a matrix with terms as rows and latent dimensions in columns). |
|
17 |
#' @seealso If you just want to compare texts, see the \code{\link{lingmatch}()} function. |
|
18 |
#' @examples |
|
19 |
#' # starting with some texts in a vector |
|
20 |
#' texts <- c( |
|
21 |
#' "Firstly, I would like to say, and with all due respect...", |
|
22 |
#' "Please, proceed. I hope you feel you can speak freely...", |
|
23 |
#' "Oh, of course, I just hope to be clear, and not cause offense...", |
|
24 |
#' "Oh, no, don't monitor yourself on my account..." |
|
25 |
#' ) |
|
26 |
#' |
|
27 |
#' # by default, term counts and metastatistics are returned |
|
28 |
#' lma_process(texts) |
|
29 |
#' |
|
30 |
#' # add dictionary and percent arguments for standard dictionary-based results |
|
31 |
#' lma_process(texts, dict = lma_dict(), percent = TRUE) |
|
32 |
#' |
|
33 |
#' # add space and weight arguments for standard word-centroid vectors |
|
34 |
#' lma_process(texts, space = lma_lspace(texts), weight = "tfidf") |
|
35 |
#' @export |
|
36 | ||
37 |
lma_process <- function(input = NULL, ..., meta = TRUE, coverage = FALSE) { |
|
38 | 23x |
inp <- as.list(substitute(...())) |
39 | 23x |
funs <- c("read.segments", "lma_dtm", "lma_weight", "lma_lspace", "lma_termcat", "lma_patcat") |
40 | 23x |
allargs <- NULL |
41 | 23x |
arg_matches <- list() |
42 | 23x |
for (f in funs) { |
43 | 138x |
a <- names(as.list(args(f))) |
44 | 138x |
a <- a[c(FALSE, a[-1] %in% names(inp))] |
45 | 138x |
allargs <- c(allargs, a) |
46 | 138x |
arg_matches[[f]] <- inp[a] |
47 |
} |
|
48 | 23x |
dupargs <- unique(allargs[duplicated(allargs)]) |
49 | 23x |
arg_checks <- vapply(arg_matches, function(l) sum(!names(l) %in% dupargs), 0) |
50 |
# identify input |
|
51 | 23x |
op <- NULL |
52 | 1x |
if (is.function(input)) stop("enter a character vector or matrix-like object as input") |
53 | 2x |
if (is.null(dim(input)) && is.list(input) && is.character(input[[1]])) input <- unlist(input, use.names = FALSE) |
54 | 22x |
if (is.character(input) || is.factor(input)) { |
55 | 12x |
ck_paths <- length(input) != 1 && all(file.exists(input)) |
56 | 12x |
op <- if (length(arg_matches$read.segments) || ck_paths) { |
57 | 1x |
an <- names(arg_matches$read.segments) |
58 | 1x |
if (!any(grepl("path|text", an))) arg_matches$read.segments$path <- input |
59 | 1x |
do.call(read.segments, lapply(arg_matches$read.segments, eval.parent, 2)) |
60 |
} else { |
|
61 | 11x |
data.frame( |
62 | 11x |
text = if (length(input) == 1 && file.exists(input)) readLines(input) else input, stringsAsFactors = FALSE |
63 |
) |
|
64 |
} |
|
65 |
} else { |
|
66 | 10x |
if (is.null(dim(input))) { |
67 | 3x |
if (is.null(names(input)) || (is.list(input) && !all(vapply(input, length, 0) == length(input[[1]])))) { |
68 | 1x |
stop("input is not of a recognized format -- should be text or a dtm-like object") |
69 |
} |
|
70 | 2x |
input <- if (is.list(input)) as.data.frame(input, stringsAsFactors = FALSE) else t(input) |
71 |
} |
|
72 | 9x |
op <- input |
73 |
} |
|
74 |
# process |
|
75 | 21x |
ck_text <- "text" %in% colnames(op) && is.character(op[, "text"]) |
76 | 21x |
ck_changed <- FALSE |
77 | 21x |
if (ck_text) { |
78 | 13x |
if (arg_checks[["lma_patcat"]]) { |
79 | 3x |
if (!"return.dtm" %in% names(arg_matches$lma_patcat) && (coverage || length(arg_matches$lma_weight))) { |
80 | 3x |
arg_matches$lma_patcat$return.dtm <- TRUE |
81 |
} |
|
82 | 3x |
arg_matches$lma_patcat$text <- op[, "text"] |
83 | 3x |
x <- do.call(lma_patcat, lapply(arg_matches$lma_patcat, eval.parent, 2)) |
84 | 3x |
ck_changed <- TRUE |
85 |
} else { |
|
86 | 10x |
arg_matches$lma_dtm$text <- op[, "text"] |
87 | 10x |
x <- do.call(lma_dtm, lapply(arg_matches$lma_dtm, eval.parent, 2)) |
88 | 10x |
ck_changed <- TRUE |
89 |
} |
|
90 |
} else { |
|
91 | 8x |
x <- op |
92 |
} |
|
93 | 21x |
if (arg_checks[["lma_weight"]]) { |
94 | 10x |
arg_matches$lma_weight$dtm <- x |
95 | 10x |
x <- do.call(lma_weight, lapply(arg_matches$lma_weight, eval.parent, 2)) |
96 | 10x |
attr(x, "categories") <- attr(arg_matches$lma_weight$dtm, "categories") |
97 | 10x |
ck_changed <- TRUE |
98 |
} |
|
99 | 21x |
if (!is.null(attr(x, "categories"))) { |
100 | 5x |
categories <- attr(x, "categories") |
101 | 5x |
xc <- as.data.frame( |
102 | 5x |
matrix(0, nrow(op), length(categories), dimnames = list(NULL, names(categories))), |
103 | 5x |
stringsAsFactors = FALSE |
104 |
) |
|
105 | 5x |
if (coverage) { |
106 | 1x |
cxc <- xc |
107 | 1x |
for (cat in names(categories)) { |
108 | 3x |
su <- x[, categories[[cat]], drop = FALSE] |
109 | 3x |
xc[, cat] <- rowSums(su, na.rm = TRUE) |
110 | 3x |
cxc[, cat] <- rowSums(su != 0, na.rm = TRUE) |
111 |
} |
|
112 | 1x |
colnames(cxc) <- paste0("coverage_", colnames(xc)) |
113 | 1x |
xc <- cbind(xc, cxc) |
114 |
} else { |
|
115 | 4x |
for (cat in names(categories)) { |
116 | 48x |
xc[, cat] <- rowSums(x[, categories[[cat]], drop = FALSE], na.rm = TRUE) |
117 |
} |
|
118 |
} |
|
119 | 5x |
x <- xc |
120 | 5x |
ck_changed <- TRUE |
121 | 16x |
} else if (arg_checks[["lma_termcat"]] || (length(arg_matches$lma_termcat) && |
122 | 16x |
any(names(arg_matches$lma_termcat) != "dir"))) { |
123 | 9x |
arg_matches$lma_termcat$coverage <- coverage |
124 | 9x |
arg_matches$lma_termcat$dtm <- x |
125 | 9x |
x <- do.call(lma_termcat, lapply(arg_matches$lma_termcat, eval.parent, 2)) |
126 | 9x |
ck_changed <- TRUE |
127 |
} |
|
128 | 21x |
if (arg_checks[["lma_lspace"]]) { |
129 | 2x |
nr <- NROW(x) |
130 | 2x |
arg_matches$lma_lspace$dtm <- x |
131 | 2x |
x <- do.call(lma_lspace, lapply(arg_matches$lma_lspace, eval.parent, 2)) |
132 | 2x |
if (nrow(x) != nr) { |
133 | 1x |
colnames(x) <- paste0("dim", seq_len(ncol(x))) |
134 | 1x |
return(x) |
135 |
} |
|
136 | 1x |
ck_changed <- TRUE |
137 |
} |
|
138 | 5x |
if (any(grepl("Matrix", class(x), fixed = TRUE))) x <- as.matrix(x) |
139 | 15x |
if (is.matrix(x)) x <- as.data.frame(x, stringsAsFactors = FALSE) |
140 | 20x |
op <- if (ck_text && ck_changed) cbind(op, if (is.null(dim(x))) t(x) else x) else x |
141 | 20x |
if (ck_text && meta) { |
142 | 10x |
opm <- lma_meta(op[, "text"]) |
143 | 10x |
if (arg_checks[["lma_weight"]] && |
144 | 10x |
(!"normalize" %in% names(arg_matches$lma_weight) || arg_matches$lma_weight$normalize)) { |
145 | 3x |
cols <- c(9, 14:23) |
146 | 3x |
opm_counts <- opm[, cols] |
147 | 3x |
su <- opm_counts != 0 |
148 | 3x |
adj <- if ("percent" %in% names(arg_matches$lma_weight) && arg_matches$lma_weight$percent) 100 else 1 |
149 | 3x |
opm_counts[su] <- opm_counts[su] / rep(opm$words, length(cols))[which(su)] * adj |
150 | 3x |
opm[, cols] <- opm_counts |
151 |
} |
|
152 | 10x |
colnames(opm) <- paste0("meta_", colnames(opm)) |
153 | 10x |
op <- cbind(op, opm) |
154 |
} |
|
155 | 20x |
op |
156 |
} |
1 |
#' English Function Word Category and Special Character Lists |
|
2 |
#' |
|
3 |
#' Returns a list of function words based on the Linguistic Inquiry and Word Count 2015 dictionary |
|
4 |
#' (in terms of category names -- words were selected independently), or a list of special characters and patterns. |
|
5 |
#' @param ... Numbers or letters corresponding to category names: ppron, ipron, article, |
|
6 |
#' adverb, conj, prep, auxverb, negate, quant, interrog, number, interjection, or special. |
|
7 |
#' @param as.regex Logical: if \code{FALSE}, lists are returned without regular expression. |
|
8 |
#' @param as.function Logical or a function: if specified and \code{as.regex} is \code{TRUE}, the selected dictionary |
|
9 |
#' will be collapsed to a regex string (terms separated by \code{|}), and a function for matching characters to that |
|
10 |
#' string will be returned. The regex string is passed to the matching function (\code{\link{grepl}} by default) |
|
11 |
#' as a 'pattern' argument, with the first argument of the returned function being passed as an 'x' argument. |
|
12 |
#' See examples. |
|
13 |
#' @note |
|
14 |
#' The \code{special} category is not returned unless specifically requested. It is a list of regular expression |
|
15 |
#' strings attempting to capture special things like ellipses and emojis, or sets of special characters (those outside |
|
16 |
#' of the Basic Latin range; \code{[^\\u0020-\\u007F]}), which can be used for character conversions. |
|
17 |
#' If \code{special} is part of the returned list, \code{as.regex} is set to \code{TRUE}. |
|
18 |
#' |
|
19 |
#' The \code{special} list is always used by both \code{\link{lma_dtm}} and \code{\link{lma_termcat}}. When creating a |
|
20 |
#' dtm, \code{special} is used to clean the original input (so that, by default, the punctuation involved in ellipses |
|
21 |
#' and emojis are treated as different -- as ellipses and emojis rather than as periods and parens and colons and such). |
|
22 |
#' When categorizing a dtm, the input dictionary is passed by the special lists to be sure the terms in the dtm match up |
|
23 |
#' with the dictionary (so, for example, ": (" would be replaced with "repfrown" in both the text and dictionary). |
|
24 |
#' @seealso To score texts with these categories, use \code{\link{lma_termcat}()}. |
|
25 |
#' @return A list with a vector of terms for each category, or (when \code{as.function = TRUE}) a function which |
|
26 |
#' accepts an initial "terms" argument (a character vector), and any additional arguments determined by function |
|
27 |
#' entered as \code{as.function} (\code{\link{grepl}} by default). |
|
28 |
#' @examples |
|
29 |
#' # return the full dictionary (excluding special) |
|
30 |
#' lma_dict() |
|
31 |
#' |
|
32 |
#' # return the standard 7 category lsm categories |
|
33 |
#' lma_dict(1:7) |
|
34 |
#' |
|
35 |
#' # return just a few categories without regular expression |
|
36 |
#' lma_dict(neg, ppron, aux, as.regex = FALSE) |
|
37 |
#' |
|
38 |
#' # return special specifically |
|
39 |
#' lma_dict(special) |
|
40 |
#' |
|
41 |
#' # returning a function |
|
42 |
#' is.ppron <- lma_dict(ppron, as.function = TRUE) |
|
43 |
#' is.ppron(c("i", "am", "you", "were")) |
|
44 |
#' |
|
45 |
#' in.lsmcat <- lma_dict(1:7, as.function = TRUE) |
|
46 |
#' in.lsmcat(c("a", "frog", "for", "me")) |
|
47 |
#' |
|
48 |
#' ## use as a stopword filter |
|
49 |
#' is.stopword <- lma_dict(as.function = TRUE) |
|
50 |
#' dtm <- lma_dtm("Most of these words might not be all that relevant.") |
|
51 |
#' dtm[, !is.stopword(colnames(dtm))] |
|
52 |
#' |
|
53 |
#' ## use to replace special characters |
|
54 |
#' clean <- lma_dict(special, as.function = gsub) |
|
55 |
#' clean(c( |
|
56 |
#' "\u201Ccurly quotes\u201D", "na\u00EFve", "typographer\u2019s apostrophe", |
|
57 |
#' "en\u2013dash", "em\u2014dash" |
|
58 |
#' )) |
|
59 |
#' @export |
|
60 | ||
61 |
lma_dict <- function(..., as.regex = TRUE, as.function = FALSE) { |
|
62 | 94x |
cats <- as.character(substitute(...())) |
63 | 94x |
dict <- list( |
64 | 94x |
ppron = c( |
65 | 94x |
"^dae$", "^dem$", "^eir$", "^eirself$", "^em$", "^he$", "^he'", "^her$", "^hers$", "^herself$", "^hes$", |
66 | 94x |
"^him$", "^himself$", "^hir$", "^hirs$", "^hirself$", "^his$", "^hisself$", "^i$", "^i'", "^id$", "^idc$", |
67 | 94x |
"^idgaf$", "^idk$", "^idontknow$", "^idve$", "^iirc$", "^iknow$", "^ikr$", "^ill$", "^ily$", "^im$", "^ima$", |
68 | 94x |
"^imean$", "^imma$", "^ive$", "^lets$", "^let's$", "^me$", "^methinks$", "^mine$", "^my$", "^myself$", "^omfg$", |
69 | 94x |
"^omg$", "^oneself$", "^our$", "^ours", "^she$", "^she'", "^shes$", "^thee$", "^their$", "^their'", "^theirs", |
70 | 94x |
"^them$", "^thems", "^they$", "^they'", "^theyd$", "^theyll$", "^theyve$", "^thine$", "^thou$", "^thoust$", |
71 | 94x |
"^thy$", "^thyself$", "^u$", "^u'", "^ud$", "^ull$", "^ur$", "^ure$", "^us$", "^we$", "^we'", "^weve$", "^y'", |
72 | 94x |
"^ya'", "^yall", "^yins$", "^yinz$", "^you$", "^you'", "^youd$", "^youll$", "^your$", "^youre$", "^yours$", |
73 | 94x |
"^yourself$", "^yourselves$", "^youve$", "^zer$", "^zir$", "^zirs$", "^zirself$", "^zis$" |
74 |
), |
|
75 | 94x |
ipron = c( |
76 | 94x |
"^another$", "^anybo", "^anyone", "^anything", "^dat$", "^de+z$", "^dis$", "^everyb", "^everyone", |
77 | 94x |
"^everything", "^few$", "^it$", "^it'$", "^it'", "^itd$", "^itll$", "^its$", "^itself$", "^many$", "^nobod", |
78 | 94x |
"^nothing$", "^other$", "^others$", "^same$", "^somebo", "^somebody'", "^someone", "^something", "^stuff$", |
79 | 94x |
"^that$", "^that'", "^thatd$", "^thatll$", "^thats$", "^these$", "^these'", "^thesed$", "^thesell$", "^thesere$", |
80 | 94x |
"^thing", "^this$", "^this'", "^thisd$", "^thisll$", "^those$", "^those'", "^thosed$", "^thosell$", "^thosere$", |
81 | 94x |
"^what$", "^what'", "^whatd$", "^whatever$", "^whatll$", "^whats$", "^which", "^who$", "^who'", "^whod$", |
82 | 94x |
"^whoever$", "^wholl$", "^whom$", "^whomever$", "^whos$", "^whose$", "^whosever$", "^whosoever$" |
83 |
), |
|
84 | 94x |
article = c("^a$", "^an$", "^da$", "^teh$", "^the$"), |
85 | 94x |
adverb = c( |
86 | 94x |
"^absolutely$", "^actively$", "^actually$", "^afk$", "^again$", "^ago$", "^ahead$", "^almost$", |
87 | 94x |
"^already$", "^altogether$", "^always$", "^angrily$", "^anxiously$", "^any$", "^anymore$", "^anyway$", |
88 | 94x |
"^anywhere$", "^apparently$", "^automatically$", "^away$", "^awhile$", "^back$", "^badly$", "^barely$", |
89 | 94x |
"^basically$", "^below$", "^brietermsy$", "^carefully$", "^causiously$", "^certainly$", "^clearly$", "^closely$", |
90 | 94x |
"^coldly$", "^commonly$", "^completely$", "^constantly$", "^continually$", "^correctly$", "^coz$", "^currently$", |
91 | 94x |
"^daily$", "^deeply$", "^definitely$", "^definitly$", "^deliberately$", "^desperately$", "^differently$", |
92 | 94x |
"^directly$", "^early$", "^easily$", "^effectively$", "^elsewhere$", "^enough$", "^entirely$", "^equally$", |
93 | 94x |
"^especially$", "^essentially$", "^etc$", "^even$", "^eventually$", "^ever$", "^every$", "^everyday$", |
94 | 94x |
"^everywhere", "^exactly$", "^exclusively$", "^extremely$", "^fairly$", "^far$", "^finally$", "^fortunately$", |
95 | 94x |
"^frequently$", "^fully$", "^further$", "^generally$", "^gently$", "^genuinely$", "^good$", "^greatly$", |
96 | 94x |
"^hardly$", "^heavily$", "^hence$", "^henceforth$", "^hereafter$", "^herein$", "^heretofore$", "^hesitantly$", |
97 | 94x |
"^highly$", "^hither$", "^hopefully$", "^hotly$", "^however$", "^immediately$", "^importantly$", "^increasingly$", |
98 | 94x |
"^incredibly$", "^indeed$", "^initially$", "^instead$", "^intensely$", "^jus$", "^just$", "^largely$", "^lately$", |
99 | 94x |
"^least$", "^legitimately$", "^less$", "^lightly$", "^likely$", "^literally$", "^loudly$", "^luckily$", |
100 | 94x |
"^mainly$", "^maybe$", "^meanwhile$", "^merely$", "^more$", "^moreover$", "^most$", "^mostly$", "^much$", |
101 | 94x |
"^namely$", "^naturally$", "^nearly$", "^necessarily$", "^nervously$", "^never$", "^nevertheless$", "^no$", |
102 | 94x |
"^nonetheless$", "^normally$", "^not$", "^notwithstanding$", "^obviously$", "^occasionally$", "^often$", "^once$", |
103 | 94x |
"^only$", "^originally$", "^otherwise$", "^overall$", "^particularly$", "^passionately$", "^perfectly$", |
104 | 94x |
"^perhaps$", "^personally$", "^physically$", "^please$", "^possibly$", "^potentially$", "^practically$", |
105 | 94x |
"^presently$", "^previously$", "^primarily$", "^probability$", "^probably$", "^profoundly$", "^prolly$", |
106 | 94x |
"^properly$", "^quickly$", "^quietly$", "^quite$", "^randomly$", "^rarely$", "^rather$", "^readily$", "^really$", |
107 | 94x |
"^recently$", "^regularly$", "^relatively$", "^respectively$", "^right$", "^roughly$", "^sadly$", "^seldomly$", |
108 | 94x |
"^seriously$", "^shortly$", "^significantly$", "^similarly$", "^simply$", "^slightly$", "^slowly$", "^so$", |
109 | 94x |
"^some$", "^somehow$", "^sometimes$", "^somewhat$", "^somewhere$", "^soon$", "^specifically$", "^still$", |
110 | 94x |
"^strongly$", "^subsequently$", "^successfully$", "^such$", "^suddenly$", "^supposedly$", "^surely$", |
111 | 94x |
"^surprisingly$", "^technically$", "^terribly$", "^thence$", "^thereafter$", "^therefor$", "^therefore$", |
112 | 94x |
"^thither$", "^thoroughly$", "^thus$", "^thusfar$", "^thusly$", "^together$", "^too$", "^totally$", "^truly$", |
113 | 94x |
"^typically$", "^ultimately$", "^uncommonly$", "^unfortunately$", "^unfortunatly$", "^usually$", "^vastly$", |
114 | 94x |
"^very$", "^virtually$", "^well$", "^whence$", "^where", "^wherefor", "^whither$", "^wholly$", "^why$", "^why'", |
115 | 94x |
"^whyd$", "^whys$", "^widely$", "^wither$", "^yet$" |
116 |
), |
|
117 | 94x |
conj = c( |
118 | 94x |
"^also$", "^altho$", "^although$", "^and$", "^b/c$", "^bc$", "^because$", "^besides$", "^both$", "^but$", |
119 | 94x |
"^'cause$", "^cos$", "^cuz$", "^either$", "^else$", "^except$", "^for$", "^how$", "^how'", "^howd$", "^howll$", |
120 | 94x |
"^hows$", "^if$", "^neither$", "^nor$", "^or$", "^than$", "^tho$", "^though$", "^unless$", "^unlike$", "^versus$", |
121 | 94x |
"^vs$", "^when$", "^when'", "^whenever$", "^whereas$", "^whether$", "^while$", "^whilst$" |
122 |
), |
|
123 | 94x |
prep = c( |
124 | 94x |
"^about$", "^above$", "^abt$", "^across$", "^acrost$", "^afk$", "^after$", "^against$", "^along$", "^amid", |
125 | 94x |
"^among", "^around$", "^as$", "^at$", "^atop$", "^before$", "^behind$", "^beneath$", "^beside$", "^betwe", |
126 | 94x |
"^beyond$", "^by$", "^despite$", "^down$", "^during$", "^excluding$", "^from$", "^here$", "^here'", "^heres$", |
127 | 94x |
"^in$", "^including$", "^inside$", "^into$", "^minus$", "^near$", "^now$", "^of$", "^off$", "^on$", "^onto$", |
128 | 94x |
"^out$", "^outside$", "^over$", "^plus$", "^regarding$", "^sans$", "^since$", "^then$", "^there$", "^there'", |
129 | 94x |
"^thered$", "^therell$", "^theres$", "^through$", "^throughout$", "^thru$", "^til$", "^till$", "^to$", "^toward", |
130 | 94x |
"^under$", "^underneath$", "^until$", "^untill$", "^unto$", "^up$", "^upon$", "^via$", "^with$", "^within$", |
131 | 94x |
"^without$", "^worth$" |
132 |
), |
|
133 | 94x |
auxverb = c( |
134 | 94x |
"^am$", "^are$", "^arent$", "^aren't$", "^be$", "^been$", "^bein$", "^being$", "^brb$", "^can$", |
135 | 94x |
"^could$", "^could'", "^couldnt$", "^couldn't$", "^couldve$", "^did$", "^didnt$", "^didn't$", "^do$", "^does$", |
136 | 94x |
"^doesnt$", "^doesn't$", "^doing$", "^dont$", "^don't$", "^had$", "^hadnt$", "^hadn't$", "^has$", "^hasnt$", |
137 | 94x |
"^hasn't$", "^have$", "^havent$", "^haven't$", "^having$", "^is$", "^isnt$", "^isn't$", "^may$", "^might$", |
138 | 94x |
"^might'", "^mightnt$", "^mightn't$", "^mightve$", "^must$", "^mustnt$", "^mustn't$", "^mustve$", "^ought", |
139 | 94x |
"^shant$", "^shan't$", "^sha'nt$", "^shall$", "^should$", "^shouldnt$", "^shouldn't$", "^shouldve$", "^was$", |
140 | 94x |
"^wasnt$", "^wasn't$", "^were$", "^werent$", "^weren't$", "^will$", "^would$", "^would'", "^wouldnt", "^wouldn't", |
141 | 94x |
"^wouldve$" |
142 |
), |
|
143 | 94x |
negate = c( |
144 | 94x |
"^ain't$", "^aint$", "^aren't$", "^arent$", "^can't$", "^cannot$", "^cant$", "^couldn't$", "^couldnt$", |
145 | 94x |
"^didn't$", "^didnt$", "^doesn't$", "^doesnt$", "^don't$", "^dont$", "^hadn't$", "^hadnt$", "^hasn't$", "^hasnt$", |
146 | 94x |
"^haven't$", "^havent$", "^idk$", "^isn't$", "^isnt$", "^must'nt$", "^mustn't$", "^mustnt$", "^nah", "^need'nt$", |
147 | 94x |
"^needn't$", "^neednt$", "^negat", "^neither$", "^never$", "^no$", "^nobod", "^noes$", "^none$", "^nope$", |
148 | 94x |
"^nor$", "^not$", "^nothing$", "^nowhere$", "^np$", "^ought'nt$", "^oughtn't$", "^oughtnt$", "^shant$", |
149 | 94x |
"^shan't$", "^sha'nt$", "^should'nt$", "^shouldn't$", "^shouldnt$", "^uh-uh$", "^wasn't$", "^wasnt$", "^weren't$", |
150 | 94x |
"^werent$", "^without$", "^won't$", "^wont$", "^wouldn't$", "^wouldnt$" |
151 |
), |
|
152 | 94x |
quant = c( |
153 | 94x |
"^add$", "^added$", "^adding$", "^adds$", "^all$", "^allot$", "^alot$", "^amount$", "^amounts$", |
154 | 94x |
"^another$", "^any$", "^approximat", "^average$", "^bit$", "^bits$", "^both$", "^bunch$", "^chapter$", "^couple$", |
155 | 94x |
"^doubl", "^each$", "^either$", "^entire", "^equal", "^every$", "^extra$", "^few$", "^fewer$", "^fewest$", |
156 | 94x |
"^group", "^inequal", "^least$", "^less$", "^lot$", "^lotof$", "^lots$", "^lotsa$", "^lotta$", "^majority$", |
157 | 94x |
"^many$", "^mo$", "^mo'", "^more$", "^most$", "^much$", "^mucho$", "^multiple$", "^nada$", "^none$", "^part$", |
158 | 94x |
"^partly$", "^percent", "^piece$", "^pieces$", "^plenty$", "^remaining$", "^sampl", "^scarce$", "^scarcer$", |
159 | 94x |
"^scarcest$", "^section$", "^segment", "^series$", "^several", "^single$", "^singles$", "^singly$", "^some$", |
160 | 94x |
"^somewhat$", "^ton$", "^tons$", "^total$", "^triple", "^tripling$", "^variety$", "^various$", "^whole$" |
161 |
), |
|
162 | 94x |
interrog = c( |
163 | 94x |
"^how$", "^how'd$", "^how're$", "^how's$", "^howd$", "^howre$", "^hows$", "^wat$", "^wattt", "^what$", |
164 | 94x |
"^what'd$", "^what'll$", "^what're$", "^what's$", "^whatd$", "^whatever$", "^whatll$", "^whatre$", "^whatt", |
165 | 94x |
"^when$", "^when'", "^whence$", "^whenever$", "^where$", "^where'd$", "^where's$", "^wherefore$", "^wherever$", |
166 | 94x |
"^whether$", "^which$", "^whichever$", "^whither$", "^who$", "^who'd$", "^who'll$", "^who's$", "^whoever$", |
167 | 94x |
"^wholl$", "^whom$", "^whomever$", "^whos$", "^whose$", "^whosever$", "^whoso", "^why$", "^why'", "^whyever$", |
168 | 94x |
"^wut$" |
169 |
), |
|
170 | 94x |
number = c( |
171 | 94x |
"^billion", "^doubl", "^dozen", "^eight", "^eleven$", "^fift", "^first$", "^firstly$", "^firsts$", |
172 | 94x |
"^five$", "^four", "^half$", "^hundred", "^infinit", "^million", "^nine", "^once$", "^one$", "^quarter", |
173 | 94x |
"^second$", "^seven", "^single$", "^six", "^ten$", "^tenth$", "^third$", "^thirt", "^thousand", "^three$", |
174 | 94x |
"^trillion", "^twel", "^twent", "^twice$", "^two$", "^zero$", "^zillion" |
175 |
), |
|
176 | 94x |
interjection = c( |
177 | 94x |
"^a+h+$", "^a+w+$", "^allas$", "^alright", "^anyhoo$", "^anyway[ysz]", "^bl[eh]+$", "^g+[eah]+$", |
178 | 94x |
"^h[ah]+$", "^h[hu]+$", "^h[mh]+$", "^l[ol]+$", "^m[hm]+$", "^meh$", "^o+h+$", "^o+k+$", "^okie", "^oo+f+$", |
179 | 94x |
"^soo+$", "^u[uh]+$", "^u+g+h+$", "^w[ow]+$", "^wee+ll+$", "^y[aes]+$", "^ya+h+$", "^yeah$", "^yus+$" |
180 |
), |
|
181 | 94x |
special = list( |
182 | 94x |
ELLIPSIS = "\\.{3, }|\\. +\\. +[. ]+", |
183 | 94x |
SMILE = "\\s(?:[[{(<qd]+[\\s<-]*[;:8=]|[;:8=][\\s>-]*[]})>Dpb]+|[uUnwWmM^=+-]_[uUnwWmM^=+-])(?=\\s)", |
184 | 94x |
FROWN = "\\s(?:[]D)}>]+[\\s.,<-]*[;:8=]|[;:8=][\\s.,>-]*[[{(<]+|[Tt:;]_[Tt;:]|[uUtT;:][mMn][uUtT;:])(?=\\s)", |
185 | 94x |
LIKE = c( |
186 | 94x |
"(?<=could not) like\\b", "(?<=did not) like\\b", "(?<=did) like\\b", "(?<=didn't) like\\b", |
187 | 94x |
"(?<=do not) like\\b", "(?<=do) like\\b", "(?<=does not) like\\b", "(?<=does) like\\b", "(?<=doesn't) like\\b", |
188 | 94x |
"(?<=don't) like\\b", "(?<=i) like\\b", "(?<=should not) like\\b", "(?<=they) like\\b", "(?<=we) like\\b", |
189 | 94x |
"(?<=will not) like\\b", "(?<=will) like\\b", "(?<=won't) like\\b", "(?<=would not) like\\b", |
190 | 94x |
"(?<=you) like\\b" |
191 |
), |
|
192 | 94x |
CHARACTERS = c( |
193 | 94x |
` ` = "\\s", |
194 | 94x |
`'` = paste0( |
195 | 94x |
"[\u00B4\u2018\u2019\u201A\u201B\u2032\u2035\u02B9\u02BB\u02BE\u02BF\u02C8\u02CA\u02CB\u02F4", |
196 | 94x |
"\u0300\u0301\u030D\u0312\u0313\u0314\u0315\u031B\u0321\u0322\u0326\u0328\u0329\u0340\u0341\u0343\u0351", |
197 | 94x |
"\u0357]" |
198 |
), |
|
199 | 94x |
`"` = "[\u201C\u201D\u201E\u201F\u2033\u2034\u2036\u2037\u2057\u02BA\u02DD\u02EE\u02F5\u02F6\u030B\u030F]", |
200 | 94x |
`...` = "\u2026", |
201 | 94x |
`-` = "[\u05BE\u1806\u2010\u2011\u2013\uFE58\uFE63\uFF0D]", |
202 | 94x |
` - ` = "[\u2012\u2014\u2015\u2E3A\u2E3B]|--+", |
203 | 94x |
a = paste0( |
204 | 94x |
"[\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5\u0100\u0101\u0102", |
205 | 94x |
"\u0103\u0104\u105\u0200\u0201\u0202\u0203\u0226\u0227\u0245\u0250\u0251\u0252\u0255\u0363\u0386\u0391", |
206 | 94x |
"\u0410\u0430]" |
207 |
), |
|
208 | 94x |
ae = "[\u00C6\u00E6\u0152\u0153\u0276]", |
209 | 94x |
b = paste0( |
210 | 94x |
"[\u00DF\u0180\u0181\u0182\u0183\u0184\u0185\u0186\u0187\u0188\u0189\u018A\u018B\u018C\u0243", |
211 | 94x |
"\u0253\u0299\u0411\u0412\u0431\u0432\u0462\u0463\u0494\u0495\u212C]" |
212 |
), |
|
213 | 94x |
c = paste0( |
214 | 94x |
"[\u00C7\u00E7\u0106\u0107\u0108\u0109\u0186\u0187\u0188\u0254\u0297\u0368\u0421\u0441\u2102", |
215 | 94x |
"\u2103]" |
216 |
), |
|
217 | 94x |
d = paste0( |
218 | 94x |
"[\u00D0\u00DE\u00FE\u010D\u010E\u010F\u0110\u0111\u0189\u0221\u0256\u0256\u0257\u0369\u0392", |
219 | 94x |
"\u0434\u0500\u2145\u2146]" |
220 |
), |
|
221 | 94x |
e = paste0( |
222 | 94x |
"[\u00C8\u00C9\u00CA\u00CB\u00E8\u00E9\u00EA\u00EB\u0112\u0113\u0114\u0115\u0116\u0117\u0118", |
223 | 94x |
"\u0119\u011A\u011B\u018E\u018F\u0190\u0204\u0205\u0206\u0207\u0228\u0229\u0246\u0247\u0258\u0259\u0364", |
224 | 94x |
"\u0388\u0395\u0400\u0401\u0404\u0415\u0417\u0435\u0437\u0450\u0451\u0454\u0498\u0499\u2107\u2108\u2128", |
225 | 94x |
"\u212E\u212F\u2130\u2147]" |
226 |
), |
|
227 | 94x |
f = "[\u0191\u0192\u0492\u0493\u2109\u2231\u2132\u214E]", |
228 | 94x |
g = "[\u011C\u011D\u011E\u011F\u0120\u0121\u0122\u0123\u0193\u0222\u0260\u0261\u0262\u210A\u2141]", |
229 | 94x |
h = "[\u0124\u0125\u0127\u0195\u0266\u0267\u0389\u0397\u0452\u210B\u210C\u210D\u210E\u210F]", |
230 | 94x |
i = paste0( |
231 | 94x |
"[\u00CC\u00CD\u00CE\u00CF\u00EC\u00ED\u00EE\u00EF\u0128\u0129\u012A\u012B\u012C\u012D\u012E\u012F", |
232 | 94x |
"\u0130\u0131\u0197\u019A\u0208\u0209\u0365\u0390\u0399\u0406\u0407\u0456\u0457]" |
233 |
), |
|
234 | 94x |
j = "[\u0135\u0236\u0237\u0248\u0249\u0408\u0458\u2129\u2139\u2149]", |
235 | 94x |
k = "[\u0137\u0138\u0198\u0199\u212A]", |
236 | 94x |
l = "[\u0139\u013A\u013B\u013C\u013D\u013E\u013F\u0140\u0141\u0142\u0234]", |
237 | 94x |
m = "[\u0271\u0460\u2133]", |
238 | 94x |
n = paste0( |
239 | 94x |
"[\u00D1\u00F1\u0143\u0144\u0145\u0146\u0147\u0148\u0149\u014A\u014B\u0220\u0235\u0272\u0273", |
240 | 94x |
"\u0274\u0376\u0377\u0418\u0419\u0438\u0439\u2115\u2135]" |
241 |
), |
|
242 | 94x |
h = "\u0149", |
243 | 94x |
o = paste0( |
244 | 94x |
"[\u00D2\u00D3\u00D4\u00D5\u00D6\u00D8\u00F0\u00F2\u00F3\u00F4\u00F5\u00F6\u00F8\u014C\u014D", |
245 | 94x |
"\u014E\u014F\u0150\u0151\u0150\u0151\u0230\u0231\u0275\u0298\u0366\u0398\u0424\u0444\u0472\u0473\u2134]" |
246 |
), |
|
247 | 94x |
p = "[\u0420\u0440\u2117\u2118\u2119]", |
248 | 94x |
q = "[\u018D\u211A\u213A]", |
249 | 94x |
r = paste0( |
250 | 94x |
"[\u0154\u0155\u0156\u0157\u0158\u0159\u0211\u0212\u0213\u0279\u0280\u0281\u0433\u0453\u0490", |
251 | 94x |
"\u0491\u211B\u211C\u211D\u211F\u213E]" |
252 |
), |
|
253 | 94x |
s = "[\u015A\u015C\u015D\u015E\u015F\u0160\u0161\u0160\u0161\u0218\u0219\u0405\u0455]", |
254 | 94x |
t = "[\u0162\u0163\u0164\u0165\u0166\u0167\u0371\u0373\u0422\u0442]", |
255 | 94x |
u = paste0( |
256 | 94x |
"[\u00D9\u00DA\u00DB\u00DC\u00F9\u00FA\u00FB\u00FC\u00FC\u0168\u0169\u016A\u016B\u016C\u016D", |
257 | 94x |
"\u016E\u016F\u0170\u0171\u0172\u0173\u01D3\u01D4\u01D5\u01D6\u01D7\u01D8\u01D9\u01DA\u01DB\u01DC\u0214", |
258 | 94x |
"\u0217\u0244\u0289\u0367\u0426\u0446]" |
259 |
), |
|
260 | 94x |
v = "[\u0474\u0475\u0476\u0477]", |
261 | 94x |
w = "[\u0174\u0175\u0270\u0428\u0429\u0448\u0449\u0461]", |
262 | 94x |
y = "[\u00DD\u00FD\u00FF\u0176\u0177\u0178\u0232\u0233\u0423\u0427\u0443\u0447]", |
263 | 94x |
z = "[\u0179\u017A\u017B\u017C\u017E\u0224\u0225\u0240\u0290\u0291\u0396\u2124]", |
264 | 94x |
x = "[\u00D7\u0416\u0425\u0436\u0445\u0496\u0497]" |
265 |
), |
|
266 | 94x |
SYMBOLS = c( |
267 | 94x |
`(cc)` = "\u00A9", |
268 | 94x |
number = "\u2116", |
269 | 94x |
sm = "\u2120", |
270 | 94x |
tel = "\u2121", |
271 | 94x |
`(tm)` = "\u2122", |
272 | 94x |
omega = "\u2126", |
273 | 94x |
alpha = "\u2127", |
274 | 94x |
fax = "\u213B", |
275 | 94x |
pi = "[\u213C\u213F]", |
276 | 94x |
sigma = "\u2140" |
277 |
) |
|
278 |
) |
|
279 |
) |
|
280 | 14x |
if (length(cats) == 0) cats <- names(dict)[-length(dict)] |
281 | 1x |
if (length(cats) == 1 && grepl("\\(|\\[", cats)) cats <- eval(parse(text = cats)) |
282 | 48x |
if (any(grepl("[0-9]|seq", cats))) cats <- if (length(cats) > 1) as.numeric(cats) else eval(parse(text = cats)) |
283 | 94x |
if (is.numeric(cats)) { |
284 | 48x |
cats <- cats[cats < length(dict)] |
285 | 1x |
} else if (any(!cats %in% names(dict))) cats <- grep(paste(paste0("^", cats), collapse = "|"), names(dict), value = TRUE) |
286 | 94x |
if (length(cats) == 0) { |
287 | ! |
stop( |
288 | ! |
"\n enter numbers between 1 and ", length(dict) - 1, |
289 | ! |
", or letters matching a category:\n ", paste(names(dict), collapse = ", ") |
290 |
) |
|
291 |
} |
|
292 | 31x |
if ("special" %in% cats) as.regex <- TRUE |
293 | 94x |
if (as.regex) { |
294 | 88x |
if (!missing(as.function)) { |
295 | 6x |
if ("special" %in% cats && is.function(as.function) && grepl("sub", substitute(as.function))) { |
296 | 1x |
dict <- c(dict$special$CHARACTERS, dict$special$SYMBOLS) |
297 | 1x |
fun <- as.function |
298 | 1x |
if (substitute(as.function) == "gsub") { |
299 | 1x |
charmap <- as.data.frame(unlist(lapply(as.list(dict), strsplit, "")), stringsAsFactors = FALSE) |
300 | 1x |
charmap <- data.frame(to = sub("[0-9]+", "", rownames(charmap)), from = charmap[[1]], stringsAsFactors = FALSE) |
301 | 1x |
charmap <- charmap[grepl("^\\w$", charmap$to) & !charmap$from %in% c("[", "]"), ] |
302 | 1x |
dict <- dict[!names(dict) %in% charmap$to] |
303 | 1x |
charmap <- list(to = paste(charmap$to, collapse = ""), from = paste(charmap$from, collapse = "")) |
304 |
} else { |
|
305 | ! |
charmap <- NULL |
306 |
} |
|
307 | 1x |
function(terms, ...) { |
308 | 1x |
args <- list(...) |
309 | 1x |
args$x <- terms |
310 | 1x |
if (!is.null(charmap)) { |
311 | 1x |
args$x <- tryCatch(chartr(charmap$from, charmap$to, args$x), error = function(e) NULL) |
312 | 1x |
if (is.null(args$x)) { |
313 | ! |
args$x <- chartr(charmap$from, charmap$to, iconv(terms, sub = "#")) |
314 | ! |
warning("the input appears to be misencoded; it was converted, but may have errant #s") |
315 |
} |
|
316 |
} |
|
317 | 1x |
for (s in names(dict)) { |
318 | 17x |
args$pattern <- dict[s] |
319 | 17x |
args$replacement <- s |
320 | 17x |
args$x <- do.call(fun, args) |
321 |
} |
|
322 | 1x |
args$x |
323 |
} |
|
324 |
} else { |
|
325 | 5x |
dict <- paste(unlist(dict[cats]), collapse = "|") |
326 | 5x |
fun <- if (is.function(as.function)) as.function else grepl |
327 | 5x |
function(terms, ...) { |
328 | 11x |
args <- list(...) |
329 | 11x |
args$pattern <- dict |
330 | 11x |
args$x <- terms |
331 | 11x |
if (!is.function(as.function) && !"perl" %in% names(args)) args$perl <- TRUE |
332 | 11x |
do.call(fun, args) |
333 |
} |
|
334 |
} |
|
335 |
} else { |
|
336 | 82x |
dict[cats] |
337 |
} |
|
338 |
} else { |
|
339 | 6x |
lapply(dict[cats], function(l) gsub("\\^|\\$", "", sub("(?<=[^$])$", "*", l, perl = TRUE))) |
340 |
} |
|
341 |
} |
1 |
#' Calculate Text-Based Metastatistics |
|
2 |
#' |
|
3 |
#' Calculate simple descriptive statistics from text. |
|
4 |
#' |
|
5 |
#' @param text A character vector of texts. |
|
6 |
#' @return A data.frame: |
|
7 |
#' \itemize{ |
|
8 |
#' \item \strong{\code{characters}}: Total number of characters. |
|
9 |
#' \item \strong{\code{syllables}}: Total number of syllables, as estimated by split length of \cr |
|
10 |
#' \code{'a+[eu]*|e+a*|i+|o+[ui]*|u+|y+[aeiou]*'} - 1. |
|
11 |
#' \item \strong{\code{words}}: Total number of words (raw word count). |
|
12 |
#' \item \strong{\code{unique_words}}: Number of unique words (binary word count). |
|
13 |
#' \item \strong{\code{clauses}}: Number of clauses, as marked by commas, colons, semicolons, dashes, or brackets |
|
14 |
#' within sentences. |
|
15 |
#' \item \strong{\code{sentences}}: Number of sentences, as marked by periods, question marks, exclamation points, |
|
16 |
#' or new line characters. |
|
17 |
#' \item \strong{\code{words_per_clause}}: Average number of words per clause. |
|
18 |
#' \item \strong{\code{words_per_sentence}}: Average number of words per sentence. |
|
19 |
#' \item \strong{\code{sixltr}}: Number of words 6 or more characters long. |
|
20 |
#' \item \strong{\code{characters_per_word}}: Average number of characters per word |
|
21 |
#' (\code{characters} / \code{words}). |
|
22 |
#' \item \strong{\code{syllables_per_word}}: Average number of syllables per word |
|
23 |
#' (\code{syllables} / \code{words}). |
|
24 |
#' \item \strong{\code{type_token_ratio}}: Ratio of unique to total words: \code{unique_words} / \code{words}. |
|
25 |
#' \item \strong{\code{reading_grade}}: Flesch-Kincaid grade level: .39 * \code{words} / \code{sentences} + |
|
26 |
#' 11.8 * \code{syllables} / \code{words} - 15.59. |
|
27 |
#' \item \strong{\code{numbers}}: Number of terms starting with numbers. |
|
28 |
#' \item \strong{\code{punct}}: Number of terms starting with non-alphanumeric characters. |
|
29 |
#' \item \strong{\code{periods}}: Number of periods. |
|
30 |
#' \item \strong{\code{commas}}: Number of commas. |
|
31 |
#' \item \strong{\code{qmarks}}: Number of question marks. |
|
32 |
#' \item \strong{\code{exclams}}: Number of exclamation points. |
|
33 |
#' \item \strong{\code{quotes}}: Number of quotation marks (single and double). |
|
34 |
#' \item \strong{\code{apostrophes}}: Number of apostrophes, defined as any modified letter apostrophe, or backtick |
|
35 |
#' or single straight or curly quote surrounded by letters. |
|
36 |
#' \item \strong{\code{brackets}}: Number of bracketing characters (including parentheses, and square, |
|
37 |
#' curly, and angle brackets). |
|
38 |
#' \item \strong{\code{orgmarks}}: Number of characters used for organization or structuring (including |
|
39 |
#' dashes, foreword slashes, colons, and semicolons). |
|
40 |
#' } |
|
41 |
#' @examples |
|
42 |
#' text <- c( |
|
43 |
#' succinct = "It is here.", |
|
44 |
#' verbose = "Hear me now. I shall tell you about it. It is here. Do you hear?", |
|
45 |
#' couched = "I might be wrong, but it seems to me that it might be here.", |
|
46 |
#' bigwords = "Object located thither.", |
|
47 |
#' excited = "It's there! It's there! It's there!", |
|
48 |
#' drippy = "It's 'there', right? Not 'here'? 'there'? Are you Sure?", |
|
49 |
#' struggly = "It's here -- in that place where it is. Like... the 1st place (here)." |
|
50 |
#' ) |
|
51 |
#' lma_meta(text) |
|
52 |
#' @export |
|
53 | ||
54 |
lma_meta <- function(text) { |
|
55 | 11x |
text <- gsub("^\\s+|\\s+$", "", text) |
56 | 11x |
dtm <- lma_dtm(text, numbers = TRUE, punct = TRUE, urls = FALSE) |
57 | 11x |
text <- gsub(paste0( |
58 | 11x |
"((?:^|\\s)[a-z]+\\.[a-z.]+|\\d|(?:^|\\s)[a-z]|(?:^|\\s)[iv]+|", |
59 | 11x |
"ans|govt|apt|etc|st|rd|ft|feat|dr|drs|mr|ms|mrs|messrs|jr|prof)\\." |
60 | 11x |
), "", text, TRUE) |
61 | 11x |
terms <- colnames(dtm) |
62 | 11x |
dwm <- dtm[, grepl("^[a-z']", terms), drop = FALSE] |
63 | 11x |
words <- colnames(dwm) |
64 | 11x |
word_lengths <- nchar(words) |
65 | 11x |
word_syllables <- vapply(strsplit(words, "a+[eu]*|e+a*|i+|o+[ui]*|u+|y+[aeiou]*"), length, 0) - 1 |
66 | 11x |
word_syllables[word_syllables == 0] <- 1 |
67 | 11x |
res <- data.frame( |
68 | 11x |
characters = as.numeric(dwm %*% word_lengths), |
69 | 11x |
syllables = as.numeric(dwm %*% word_syllables), |
70 | 11x |
words = rowSums(dwm), |
71 | 11x |
unique_words = rowSums(dwm != 0), |
72 | 11x |
clauses = vapply(strsplit(text, '([.?!\n,:;)}>-]|\\])([.?!\n,:;)}>\n\'"-]|\\s|\\])*'), length, 0), |
73 | 11x |
sentences = vapply(strsplit(text, '[.?!\n]([.?!\n\'"]|\\s)*'), length, 0), |
74 | 11x |
stringsAsFactors = FALSE |
75 |
) |
|
76 | 11x |
cbind(res, with(res, data.frame( |
77 | 11x |
words_per_clause = words / clauses, |
78 | 11x |
words_per_sentence = words / sentences, |
79 | 11x |
sixltr = as.numeric(dwm %*% (word_lengths > 5)), |
80 | 11x |
characters_per_word = characters / words, |
81 | 11x |
syllables_per_word = syllables / words, |
82 | 11x |
type_token_ratio = unique_words / words, |
83 | 11x |
reading_grade = .39 * words / sentences + 11.8 * syllables / words - 15.59, |
84 | 11x |
numbers = if (any(su <- grepl("^[0-9]", terms))) rowSums(dtm[, su, drop = FALSE]) else 0, |
85 | 11x |
puncts = if (any(su <- grepl("^[^a-z0-9]", terms))) rowSums(dtm[, su, drop = FALSE]) else 0, |
86 | 11x |
periods = if ("." %in% terms) dtm[, "."] else 0, |
87 | 11x |
commas = if ("," %in% terms) dtm[, ","] else 0, |
88 | 11x |
qmarks = if ("?" %in% terms) dtm[, "?"] else 0, |
89 | 11x |
exclams = if ("!" %in% terms) dtm[, "!"] else 0, |
90 | 11x |
quotes = if (any(su <- grepl('^[\'"]', terms))) rowSums(dtm[, su, drop = FALSE]) else 0, |
91 | 11x |
apostrophes = vapply(strsplit(text, "[\u02bc]+|[a-zA-Z][\u0027\u0060\u2019]+[a-zA-Z]"), length, 0) - 1, |
92 | 11x |
brackets = if (any(su <- grepl("[(\\)<>{\\}[]|\\]", terms))) rowSums(dtm[, su, drop = FALSE]) else 0, |
93 | 11x |
orgmarks = if (any(su <- grepl("[/:;-]", terms))) rowSums(dtm[, su, drop = FALSE]) else 0, |
94 | 11x |
row.names = seq_len(nrow(res)), stringsAsFactors = FALSE |
95 |
))) |
|
96 |
} |
1 |
#' Document-Term Matrix Weighting |
|
2 |
#' |
|
3 |
#' Weight a document-term matrix. |
|
4 |
#' @param dtm A matrix with words as column names. |
|
5 |
#' @param weight A string referring at least partially to one (or a combination; see note) of the |
|
6 |
#' available weighting methods: |
|
7 |
#' |
|
8 |
#' \strong{Term weights} (applied uniquely to each cell) |
|
9 |
#' \itemize{ |
|
10 |
#' \item \strong{\code{binary}} \cr |
|
11 |
#' \code{(dtm > 0) * 1} \cr |
|
12 |
#' Convert frequencies to 1s and 0s; remove differences in frequencies. |
|
13 |
#' |
|
14 |
#' \item \strong{\code{log}} \cr |
|
15 |
#' \code{log(dtm + 1, log.base)} \cr |
|
16 |
#' Log of frequencies. |
|
17 |
#' |
|
18 |
#' \item \strong{\code{sqrt}} \cr |
|
19 |
#' \code{sqrt(dtm)} \cr |
|
20 |
#' Square root of frequencies. |
|
21 |
#' |
|
22 |
#' \item \strong{\code{count}} \cr |
|
23 |
#' \code{dtm} \cr |
|
24 |
#' Unaltered; sometimes called term frequencies (tf). |
|
25 |
#' |
|
26 |
#' \item \strong{\code{amplify}} \cr |
|
27 |
#' \code{dtm ^ alpha} \cr |
|
28 |
#' Amplify difference in frequencies. |
|
29 |
#' } |
|
30 |
#' |
|
31 |
#' \strong{Document weights} (applied by column) |
|
32 |
#' \itemize{ |
|
33 |
#' \item \strong{\code{dflog}} \cr |
|
34 |
#' \code{log(colSums(dtm > 0), log.base)} \cr |
|
35 |
#' Log of binary term sum. |
|
36 |
#' |
|
37 |
#' \item \strong{\code{entropy}} \cr |
|
38 |
#' \code{1 - rowSums(x *} \code{log(x + 1, log.base) /} \code{log(ncol(x), log.base),} \code{na.rm = TRUE)} \cr |
|
39 |
#' Where \code{x = t(dtm) / colSums(dtm > 0)}; entropy of term-conditional term distribution. |
|
40 |
#' |
|
41 |
#' \item \strong{\code{ppois}} \cr |
|
42 |
#' \code{1 - ppois(pois.x,} \code{colSums(dtm) / nrow(dtm))} \cr |
|
43 |
#' Poisson-predicted term distribution. |
|
44 |
#' |
|
45 |
#' \item \strong{\code{dpois}} \cr |
|
46 |
#' \code{1 - dpois(pois.x, colSums(dtm) / nrow(dtm))} \cr |
|
47 |
#' Poisson-predicted term density. |
|
48 |
#' |
|
49 |
#' \item \strong{\code{dfmlog}} \cr |
|
50 |
#' \code{log(diag(dtm[max.col(t(dtm)), ]), log.base)} \cr |
|
51 |
#' Log of maximum term frequency. |
|
52 |
#' |
|
53 |
#' \item \strong{\code{dfmax}} \cr |
|
54 |
#' \code{diag(dtm[max.col(t(dtm)), ])} \cr |
|
55 |
#' Maximum term frequency. |
|
56 |
#' |
|
57 |
#' \item \strong{\code{df}} \cr |
|
58 |
#' \code{colSums(dtm > 0)} \cr |
|
59 |
#' Sum of binary term occurrence across documents. |
|
60 |
#' |
|
61 |
#' \item \strong{\code{idf}} \cr |
|
62 |
#' \code{log(nrow(dtm) / colSums(dtm > 0), log.base)} \cr |
|
63 |
#' Inverse document frequency. |
|
64 |
#' |
|
65 |
#' \item \strong{\code{ridf}} \cr |
|
66 |
#' \code{idf - log(dpois, log.base)} \cr |
|
67 |
#' Residual inverse document frequency. |
|
68 |
#' |
|
69 |
#' \item \strong{\code{normal}} \cr |
|
70 |
#' \code{sqrt(1 / colSums(dtm ^ 2))} \cr |
|
71 |
#' Normalized document frequency. |
|
72 |
#' } |
|
73 |
#' |
|
74 |
#' Alternatively, \code{'pmi'} or \code{'ppmi'} will apply a pointwise mutual information weighting |
|
75 |
#' scheme (with \code{'ppmi'} setting negative values to 0). |
|
76 |
#' @param normalize Logical: if \code{FALSE}, the dtm is not divided by document word-count before |
|
77 |
#' being weighted. |
|
78 |
#' @param wc.complete If the dtm was made with \code{\link{lma_dtm}} (has a \code{'WC'} |
|
79 |
#' attribute), word counts for |
|
80 |
#' frequencies can be based on the raw count (default; \code{wc.complete = TRUE}). If |
|
81 |
#' \code{wc.complete = FALSE}, or the dtm does not have a \code{'WC'} attribute, |
|
82 |
#' \code{rowSums(dtm)} is used as word count. |
|
83 |
#' @param log.base The base of logs, applied to any weight using \code{\link[base]{log}}. |
|
84 |
#' Default is 10. |
|
85 |
#' @param alpha A scaling factor applied to document frequency as part of pointwise mutual |
|
86 |
#' information weighting, or amplify's power (\code{dtm ^ alpha}, which defaults to 1.1). |
|
87 |
#' @param pois.x integer; quantile or probability of the poisson distribution (\code{dpois(pois.x, |
|
88 |
#' colSums(x,} \code{na.rm = TRUE) / nrow(x))}). |
|
89 |
#' @param doc.only Logical: if \code{TRUE}, only document weights are returned (a single value for |
|
90 |
#' each term). |
|
91 |
#' @param percent Logical; if \code{TRUE}, frequencies are multiplied by 100. |
|
92 |
#' @note |
|
93 |
#' Term weights works to adjust differences in counts within documents, with differences meaning |
|
94 |
#' increasingly more from \code{binary} to \code{log} to \code{sqrt} to \code{count} to \code{amplify}. |
|
95 |
#' |
|
96 |
#' Document weights work to treat words differently based on their between-document or overall frequency. |
|
97 |
#' When term frequencies are constant, \code{dpois}, \code{idf}, \code{ridf}, and \code{normal} give |
|
98 |
#' less common words increasingly more weight, and \code{dfmax}, \code{dfmlog}, \code{ppois}, \code{df}, |
|
99 |
#' \code{dflog}, and \code{entropy} give less common words increasingly less weight. |
|
100 |
#' |
|
101 |
#' \code{weight} can either be a vector with two characters, corresponding to term weight and |
|
102 |
#' document weight (e.g., \code{c('count', 'idf')}), or it can be a string with term and |
|
103 |
#' document weights separated by any of \code{:\\*_/; ,-} (e.g., \code{'count-idf'}). |
|
104 |
#' \code{'tf'} is also acceptable for \code{'count'}, and \code{'tfidf'} will be parsed as |
|
105 |
#' \code{c('count', 'idf')}, though this is a special case. |
|
106 |
#' |
|
107 |
#' For \code{weight}, term or document weights can be entered individually; term weights alone will |
|
108 |
#' not apply any document weight, and document weights alone will apply a \code{'count'} term weight |
|
109 |
#' (unless \code{doc.only = TRUE}, in which case a term-named vector of document weights is returned |
|
110 |
#' instead of a weighted dtm). |
|
111 |
#' @return A weighted version of \code{dtm}, with a \code{type} attribute added (\code{attr(dtm, 'type')}). |
|
112 |
#' @examples |
|
113 |
#' # visualize term and document weights |
|
114 |
#' |
|
115 |
#' ## term weights |
|
116 |
#' term_weights <- c("binary", "log", "sqrt", "count", "amplify") |
|
117 |
#' Weighted <- sapply(term_weights, function(w) lma_weight(1:20, w, FALSE)) |
|
118 |
#' if (require(splot)) splot(Weighted ~ 1:20, labx = "Raw Count", lines = "co") |
|
119 |
#' |
|
120 |
#' ## document weights |
|
121 |
#' doc_weights <- c( |
|
122 |
#' "df", "dflog", "dfmax", "dfmlog", "idf", "ridf", |
|
123 |
#' "normal", "dpois", "ppois", "entropy" |
|
124 |
#' ) |
|
125 |
#' weight_range <- function(w, value = 1) { |
|
126 |
#' m <- diag(20) |
|
127 |
#' m[upper.tri(m, TRUE)] <- if (is.numeric(value)) { |
|
128 |
#' value |
|
129 |
#' } else { |
|
130 |
#' unlist(lapply( |
|
131 |
#' 1:20, function(v) rep(if (value == "inverted") 21 - v else v, v) |
|
132 |
#' )) |
|
133 |
#' } |
|
134 |
#' lma_weight(m, w, FALSE, doc.only = TRUE) |
|
135 |
#' } |
|
136 |
#' |
|
137 |
#' if (require(splot)) { |
|
138 |
#' category <- rep(c("df", "idf", "normal", "poisson", "entropy"), c(4, 2, 1, 2, 1)) |
|
139 |
#' op <- list( |
|
140 |
#' laby = "Relative (Scaled) Weight", labx = "Document Frequency", |
|
141 |
#' leg = "outside", lines = "connected", mv.scale = TRUE, note = FALSE |
|
142 |
#' ) |
|
143 |
#' splot( |
|
144 |
#' sapply(doc_weights, weight_range) ~ 1:20, |
|
145 |
#' options = op, title = "Same Term, Varying Document Frequencies", |
|
146 |
#' sud = "All term frequencies are 1.", |
|
147 |
#' colorby = list(category, grade = TRUE) |
|
148 |
#' ) |
|
149 |
#' splot( |
|
150 |
#' sapply(doc_weights, weight_range, value = "sequence") ~ 1:20, |
|
151 |
#' options = op, title = "Term as Document Frequencies", |
|
152 |
#' sud = "Non-zero terms are the number of non-zero terms.", |
|
153 |
#' colorby = list(category, grade = TRUE) |
|
154 |
#' ) |
|
155 |
#' splot( |
|
156 |
#' sapply(doc_weights, weight_range, value = "inverted") ~ 1:20, |
|
157 |
#' options = op, title = "Term Opposite of Document Frequencies", |
|
158 |
#' sud = "Non-zero terms are the number of zero terms + 1.", |
|
159 |
#' colorby = list(category, grade = TRUE) |
|
160 |
#' ) |
|
161 |
#' } |
|
162 |
#' |
|
163 |
#' @export |
|
164 | ||
165 |
lma_weight <- function(dtm, weight = "count", normalize = TRUE, wc.complete = TRUE, |
|
166 |
log.base = 10, alpha = 1, pois.x = 1L, doc.only = FALSE, percent = FALSE) { |
|
167 | ! |
if (is.null(dim(dtm))) dtm <- if (is.character(dtm) || is.factor(dtm)) lma_dtm(dtm) else matrix(dtm, 1) |
168 | 72x |
ck <- attr(dtm, "type") |
169 | 72x |
if (!is.null(ck) && length(ck) == 3 && (ck[1] == "TRUE" || ck[2] != "count" || ck[3] != "NA")) { |
170 | ! |
message( |
171 | ! |
"the entered dtm appears to already be weighted (", paste(ck[2:3], collapse = "*"), |
172 | ! |
"), so it will not be altered" |
173 |
) |
|
174 | ! |
return(dtm) |
175 |
} |
|
176 | 72x |
weight <- tolower(weight) |
177 | 3x |
if (missing(normalize) && any(grepl("pmi", weight))) normalize <- FALSE |
178 | 72x |
if (normalize) { |
179 | 45x |
wc <- attr(dtm, "WC") |
180 | 11x |
if (is.null(wc) || !wc.complete || nrow(dtm) != length(wc)) wc <- rowSums(dtm, na.rm = TRUE) |
181 | 45x |
adj <- if (percent) 100 else 1 |
182 | 45x |
if (.hasSlot(dtm, "x") && .hasSlot(dtm, "i")) { |
183 | 40x |
wc <- wc[dtm@i + 1] |
184 | 40x |
su <- wc != 0 |
185 | 40x |
dtm@x[su] <- dtm@x[su] / wc[su] * adj |
186 |
} else { |
|
187 | 5x |
su <- wc != 0 |
188 | 5x |
dtm[su, ] <- dtm[su, ] / wc[su] * adj |
189 |
} |
|
190 |
} |
|
191 | 72x |
nr <- nrow(dtm) |
192 | 72x |
if (any(grepl("pmi", weight))) { |
193 | 3x |
tw <- dw <- "pmi" |
194 | 3x |
if (missing(log.base)) log.base <- 2 |
195 | 3x |
twc <- sum(dtm, na.rm = TRUE) |
196 | 3x |
pc <- matrix(colSums(dtm^alpha, na.rm = TRUE) / twc^alpha, 1) |
197 | 3x |
dtm <- dtm / twc |
198 | 3x |
dtm <- dtm / rowSums(dtm, na.rm = TRUE) %*% pc |
199 | 3x |
if (.hasSlot(dtm, "x")) { |
200 | 1x |
dtm@x <- log(dtm@x, base = log.base) |
201 | 1x |
dtm@x[!is.finite(dtm@x)] <- 0 |
202 |
} else { |
|
203 | 2x |
dtm <- log(dtm, base = log.base) |
204 | 2x |
dtm[!is.finite(dtm)] <- 0 |
205 |
} |
|
206 | 3x |
if (any(grepl("pp", weight))) { |
207 | 1x |
tw <- dw <- "ppmi" |
208 | 1x |
dtm[dtm < 0] <- 0 |
209 |
} |
|
210 |
} else { |
|
211 | 69x |
term <- function(x, type) { |
212 | 56x |
switch(type, |
213 | 1x |
binary = (x > 0) * 1, |
214 | 1x |
log = log(x + 1, base = log.base), |
215 | 1x |
sqrt = sqrt(x), |
216 | 52x |
count = x, |
217 | 1x |
amplify = x^alpha |
218 |
) |
|
219 |
} |
|
220 | 69x |
doc <- function(x, type) { |
221 | 27x |
d <- switch(type, |
222 | 27x |
df = colSums(x > 0, na.rm = TRUE), |
223 | 27x |
dflog = log(colSums(x > 0, na.rm = TRUE), base = log.base), |
224 | 27x |
dfmax = diag(x[max.col(t(x)), ]), |
225 | 27x |
dfmlog = log(diag(x[max.col(t(x)), ]), base = log.base), |
226 | 27x |
idf = log(nrow(x) / colSums(x > 0, na.rm = TRUE), base = log.base), |
227 | 27x |
normal = sqrt(1 / colSums(x^2, na.rm = TRUE)), |
228 | 27x |
dpois = 1 - dpois(pois.x, colSums(x, na.rm = TRUE) / nrow(x)), |
229 | 27x |
ppois = 1 - ppois(pois.x, colSums(x, na.rm = TRUE) / nrow(x)), |
230 | 27x |
ridf = doc(x, "idf") - log(doc(x, "dpois"), base = log.base), |
231 | 27x |
entropy = { |
232 | 1x |
x <- t(x) / colSums(x > 0, na.rm = TRUE) |
233 | 1x |
1 - rowSums(x * log(x + 1, base = log.base) / |
234 | 1x |
log(ncol(x), base = log.base), na.rm = TRUE) |
235 |
} |
|
236 |
) |
|
237 | 27x |
d[!is.finite(d)] <- 0 |
238 | 27x |
d |
239 |
} |
|
240 | 69x |
if (length(weight) == 1) { |
241 | 69x |
weight <- strsplit(weight, " *[:\\*_/; ,-] *")[[1]] |
242 | 9x |
if (length(weight) == 1 && weight == "tfidf") weight <- c("count", "idf") |
243 |
} |
|
244 | 1x |
if (grepl("^(?:t|na|non|f)", weight[1])) weight[1] <- "count" |
245 | 69x |
tws <- c("binary", "log", "sqrt", "count", "amplify") |
246 | 69x |
tw <- if (weight[1] == "") "count" else grep(substr(weight[1], 0, 4), tws, value = TRUE)[1] |
247 | 69x |
pdw <- TRUE |
248 | 69x |
dws <- c("df", "dflog", "dfmax", "dfmlog", "idf", "normal", "dpois", "ppois", "ridf", "entropy") |
249 | 69x |
if (is.na(tw)) { |
250 | 15x |
tw <- grep(substr(weight[1], 0, 4), dws, value = TRUE)[1] |
251 | 15x |
if (!is.na(tw)) { |
252 | 15x |
pdw <- FALSE |
253 | 15x |
if (!doc.only) { |
254 | 2x |
dw <- tw |
255 | 2x |
tw <- "count" |
256 |
} else { |
|
257 | 13x |
return(doc(dtm, tw)) |
258 |
} |
|
259 |
} else { |
|
260 | ! |
stop(paste(weight, collapse = " * "), " is not a recognized weight", call. = FALSE) |
261 |
} |
|
262 |
} |
|
263 | 54x |
if (pdw) dw <- if (length(weight) > 1) grep(substr(weight[2], 0, 4), dws, value = TRUE)[1] else "none" |
264 | ! |
if (is.na(dw)) dw <- "none" |
265 | 1x |
if (missing(alpha) && tw == "amplify") alpha <- 1.1 |
266 | 56x |
dtm <- if (dw == "none") term(dtm, tw) else term(dtm, tw) * rep(doc(dtm, dw), each = nr) |
267 |
} |
|
268 | 59x |
attr(dtm, "type") <- c(normalized = normalize, term = tw, document = dw) |
269 | 59x |
dtm |
270 |
} |
1 |
#' Latent Semantic Space (Embeddings) Operations |
|
2 |
#' |
|
3 |
#' Map a document-term matrix onto a latent semantic space, extract terms from a |
|
4 |
#' latent semantic space (if \code{dtm} is a character vector, or \code{map.space =} \code{FALSE}), |
|
5 |
#' or perform a singular value decomposition of a document-term matrix (if \code{dtm} is a matrix |
|
6 |
#' and \code{space} is missing). |
|
7 |
#' @param dtm A matrix with terms as column names, or a character vector of terms to be extracted |
|
8 |
#' from a specified space. If this is of length 1 and \code{space} is missing, it will be treated |
|
9 |
#' as \code{space}. |
|
10 |
#' @param space A matrix with terms as rownames. If missing, this will be the right singular vectors |
|
11 |
#' of a singular value decomposition of \code{dtm}. If a character, a file matching the character |
|
12 |
#' will be searched for in \code{dir} (e.g., \code{space = 'google'}). If a file is not found and |
|
13 |
#' the character matches one of the \href{https://osf.io/489he/wiki/home}{available spaces}, you |
|
14 |
#' will be given the option to download it, as handled by \code{\link{download.lspace}}. |
|
15 |
#' If \code{dtm} is missing, the entire space will be loaded and returned. |
|
16 |
#' @param map.space Logical: if \code{FALSE}, the original vectors of \code{space} for terms |
|
17 |
#' found in \code{dtm} are returned. Otherwise \code{dtm} \code{\%*\%} \code{space} is returned, |
|
18 |
#' excluding uncommon columns of \code{dtm} and rows of \code{space}. |
|
19 |
#' @param fill.missing Logical: if \code{TRUE} and terms are being extracted from a space, includes |
|
20 |
#' terms not found in the space as rows of 0s, such that the returned matrix will have a row |
|
21 |
#' for every requested term. |
|
22 |
#' @param term.map A matrix with \code{space} as a column name, terms as row names, and indices of |
|
23 |
#' the terms in the given space as values, or a numeric vector of indices with terms as names, or |
|
24 |
#' a character vector of terms corresponding to rows of the space. This is used instead of reading |
|
25 |
#' in an "_terms.txt" file corresponding to a \code{space} entered as a character (the name of a |
|
26 |
#' space file). |
|
27 |
#' @param dim.cutoff If a \code{space} is calculated, this will be used to decide on the number of |
|
28 |
#' dimensions to be retained: \code{cumsum(d) / sum(d) < dim.cutoff}, where \code{d} is a vector |
|
29 |
#' of singular values of \code{dtm} (i.e., \code{svd(dtm)$d}). The default is \code{.5}; lower |
|
30 |
#' cutoffs result in fewer dimensions. |
|
31 |
#' @param keep.dim Logical: if \code{TRUE}, and a space is being calculated from the input, a matrix |
|
32 |
#' in the same dimensions as \code{dtm} is returned. Otherwise, a matrix with terms as rows and |
|
33 |
#' dimensions as columns is returned. |
|
34 |
#' @param use.scan Logical: if \code{TRUE}, reads in the rows of \code{space} with \code{\link{scan}}. |
|
35 |
#' @param dir Path to a folder containing spaces. \cr |
|
36 |
#' Set a session default with \code{options(lingmatch.lspace.dir = 'desired/path')}. |
|
37 |
#' @note |
|
38 |
#' A traditional latent semantic space is a selection of right singular vectors from the singular |
|
39 |
#' value decomposition of a dtm (\code{svd(dtm)$v[, 1:k]}, where \code{k} is the selected number of |
|
40 |
#' dimensions, decided here by \code{dim.cutoff}). |
|
41 |
#' |
|
42 |
#' Mapping a new dtm into a latent semantic space consists of multiplying common terms: |
|
43 |
#' \code{dtm[, ct]} \code{\%*\% space[ct, ]}, where \code{ct} \code{=} \code{colnames(dtm)[colnames(dtm)} |
|
44 |
#' \code{\%in\%} \code{rownames(space)]} -- the terms common between the dtm and the space. This |
|
45 |
#' results in a matrix with documents as rows, and dimensions as columns, replacing terms. |
|
46 |
#' @family Latent Semantic Space functions |
|
47 |
#' @return A matrix or sparse matrix with either (a) a row per term and column per latent dimension (a latent |
|
48 |
#' space, either calculated from the input, or retrieved when \code{map.space = FALSE}), (b) a row per document |
|
49 |
#' and column per latent dimension (when a dtm is mapped to a space), or (c) a row per document and |
|
50 |
#' column per term (when a space is calculated and \code{keep.dim = TRUE}). |
|
51 |
#' @examples |
|
52 |
#' text <- c( |
|
53 |
#' paste( |
|
54 |
#' "Hey, I like kittens. I think all kinds of cats really are just the", |
|
55 |
#' "best pet ever." |
|
56 |
#' ), |
|
57 |
#' paste( |
|
58 |
#' "Oh year? Well I really like cars. All the wheels and the turbos...", |
|
59 |
#' "I think that's the best ever." |
|
60 |
#' ), |
|
61 |
#' paste( |
|
62 |
#' "You know what? Poo on you. Cats, dogs, rabbits -- you know, living", |
|
63 |
#' "creatures... to think you'd care about anything else!" |
|
64 |
#' ), |
|
65 |
#' paste( |
|
66 |
#' "You can stick to your opinion. You can be wrong if you want. You know", |
|
67 |
#' "what life's about? Supercharging, diesel guzzling, exhaust spewing,", |
|
68 |
#' "piston moving ignitions." |
|
69 |
#' ) |
|
70 |
#' ) |
|
71 |
#' |
|
72 |
#' dtm <- lma_dtm(text) |
|
73 |
#' |
|
74 |
#' # calculate a latent semantic space from the example text |
|
75 |
#' lss <- lma_lspace(dtm) |
|
76 |
#' |
|
77 |
#' # show that document similarities between the truncated and full space are the same |
|
78 |
#' spaces <- list( |
|
79 |
#' full = lma_lspace(dtm, keep.dim = TRUE), |
|
80 |
#' truncated = lma_lspace(dtm, lss) |
|
81 |
#' ) |
|
82 |
#' sapply(spaces, lma_simets, metric = "cosine") |
|
83 |
#' |
|
84 |
#' \dontrun{ |
|
85 |
#' |
|
86 |
#' # specify a directory containing spaces, |
|
87 |
#' # or where you would like to download spaces |
|
88 |
#' space_dir <- "~/Latent Semantic Spaces" |
|
89 |
#' |
|
90 |
#' # map to a pretrained space |
|
91 |
#' ddm <- lma_lspace(dtm, "100k", dir = space_dir) |
|
92 |
#' |
|
93 |
#' # load the matching subset of the space |
|
94 |
#' # without mapping |
|
95 |
#' lss_100k_part <- lma_lspace(colnames(dtm), "100k", dir = space_dir) |
|
96 |
#' |
|
97 |
#' ## or |
|
98 |
#' lss_100k_part <- lma_lspace(dtm, "100k", map.space = FALSE, dir = space_dir) |
|
99 |
#' |
|
100 |
#' # load the full space |
|
101 |
#' lss_100k <- lma_lspace("100k", dir = space_dir) |
|
102 |
#' |
|
103 |
#' ## or |
|
104 |
#' lss_100k <- lma_lspace(space = "100k", dir = space_dir) |
|
105 |
#' } |
|
106 |
#' @export |
|
107 | ||
108 |
lma_lspace <- function(dtm = "", space, map.space = TRUE, fill.missing = FALSE, term.map = NULL, |
|
109 |
dim.cutoff = .5, keep.dim = FALSE, use.scan = FALSE, dir = getOption("lingmatch.lspace.dir")) { |
|
110 | 45x |
if (ckd <- dir == "") dir <- "~/Latent Semantic Spaces" |
111 | 45x |
if ((is.character(dtm) || is.factor(dtm)) && missing(space)) { |
112 | 2x |
if (length(dtm) > 1 && any(grepl(" ", dtm, fixed = TRUE))) { |
113 | ! |
dtm <- lma_dtm(dtm) |
114 | 2x |
} else if (length(dtm) == 1 && dtm != "") { |
115 | 2x |
if (missing(use.scan)) use.scan <- TRUE |
116 | 2x |
space <- dtm |
117 | 2x |
dtm <- "" |
118 |
} |
|
119 |
} |
|
120 | 4x |
if (is.data.frame(dtm)) dtm <- as.matrix(dtm) |
121 | 45x |
if (missing(space)) { |
122 | 11x |
nr <- nrow(dtm) |
123 | ! |
if (is.null(nr)) stop("enter a matrix for dtm, or specify a space") |
124 | 11x |
s <- svd(dtm) |
125 | 11x |
s$v <- t(s$v) |
126 | 11x |
k <- cumsum(s$d) / sum(s$d) |
127 | ! |
if (dim.cutoff > 1) dim.cutoff <- 1 |
128 | 11x |
k <- if (length(k) == 1) 1 else seq_len(if (any(k < dim.cutoff)) which(k >= dim.cutoff)[1] else 1) |
129 | 11x |
if (keep.dim) { |
130 | 4x |
dtm[] <- s$u[, k] %*% (if (length(k) == 1) matrix(s$d[k]) else diag(s$d[k])) %*% s$v[k, ] |
131 |
} else { |
|
132 | 7x |
cn <- colnames(dtm) |
133 | 7x |
dtm <- t(s$v[k, , drop = FALSE]) |
134 | 7x |
rownames(dtm) <- cn |
135 |
} |
|
136 |
} else { |
|
137 | 34x |
terms <- if (is.null(colnames(dtm))) { |
138 | 20x |
map.space <- FALSE |
139 | 20x |
dtm |
140 |
} else { |
|
141 | 14x |
colnames(dtm) |
142 |
} |
|
143 | 34x |
if (is.character(space)) { |
144 | ! |
if (space == "default") space <- "100k_lsa" |
145 | 22x |
name <- gsub("^.*[/\\]|\\.[^/\\]*$", "", space)[1] |
146 | 22x |
spaces <- list.files(dir) |
147 | 22x |
ts <- grep(space, spaces, fixed = TRUE, value = TRUE) |
148 | 22x |
if (!length(ts)) { |
149 | ! |
ts <- rownames(select.lspace(name)$selected)[1] |
150 | ! |
if (!ckd && !is.na(ts) && grepl("^$|^[yt1]|^ent", readline(paste0( |
151 | ! |
"would you like to download the ", ts, " space? (press Enter for yes): " |
152 |
)))) { |
|
153 | ! |
download.lspace(ts, dir = dir) |
154 | ! |
ts <- paste0(ts, ".dat") |
155 |
} else { |
|
156 | ! |
stop("space (", space, ") not found in dir (", dir, ")", |
157 | ! |
if (ckd) '\nspecify a directory (e.g., dir = "~") to locate or download; see ?download.lspace', |
158 | ! |
call. = FALSE |
159 |
) |
|
160 |
} |
|
161 |
} |
|
162 | 22x |
space_path <- normalizePath(paste0(dir, "/", if (length(su <- grep("\\.dat$", ts))) { |
163 | 22x |
ts[su[[1]]] |
164 |
} else { |
|
165 | ! |
use.scan <- TRUE |
166 | ! |
ts[grep("[bgx]z[ip2]*$", ts)[[1]]] |
167 | 22x |
}), "/", FALSE) |
168 | 22x |
name <- sub("\\.[^.]*$", "", basename(space_path)) |
169 | 1x |
if (name %in% colnames(term.map)) term.map <- term.map[term.map[, name] != 0, name] |
170 | 22x |
rex <- function(inds, f) { |
171 | 4x |
nc <- length(strsplit(readLines(f, 1), "\\s+")[[1]]) |
172 | 4x |
l <- length(inds) |
173 | 4x |
all <- all(seq_len(l) == inds) |
174 | 4x |
r <- matrix(0, l, nc) |
175 | 4x |
i <- 1 |
176 | 4x |
con <- file(f, "r") |
177 | 4x |
on.exit(close(con)) |
178 | 4x |
while (i <= l) { |
179 | 821x |
if (all) { |
180 | 2x |
n <- l |
181 |
} else { |
|
182 | 819x |
n <- 1 |
183 | 819x |
while (i + n < l && inds[i + n - 1] == inds[i + n] - 1) n <- n + 1 |
184 |
} |
|
185 | 821x |
r[seq_len(n) + i - 1, ] <- matrix(scan( |
186 | 821x |
con, |
187 | 821x |
n = n * nc, quiet = TRUE, skip = (if (i == 1) { |
188 | 4x |
inds[i] |
189 |
} else { |
|
190 | 817x |
inds[i] - inds[i - 1] |
191 | 821x |
}) - 1, quote = "", comment.char = "", na.strings = "" |
192 | 821x |
), n, nc, byrow = TRUE) |
193 | 821x |
i <- i + n |
194 |
} |
|
195 | 4x |
r |
196 |
} |
|
197 | 22x |
if (is.null(term.map)) { |
198 | 21x |
terms_file <- paste0(dir, "/", name, "_terms.txt") |
199 | ! |
if (!file.exists(terms_file)) stop("terms file (", space, "_terms.txt) not found in dir (", dir, ").") |
200 | 21x |
space_terms <- readLines(terms_file) |
201 | 21x |
su <- if (length(terms) == 1 && terms == "") { |
202 | 2x |
terms <- space_terms |
203 | 2x |
!logical(length(space_terms)) |
204 |
} else { |
|
205 | 19x |
space_terms %in% terms |
206 |
} |
|
207 | 21x |
if (sum(su) < length(terms)) { |
208 | 8x |
lsterms <- tolower(space_terms) |
209 | 8x |
su2 <- !duplicated(lsterms) & lsterms %in% terms[!terms %in% space_terms[su]] |
210 | 8x |
if (any(su2)) { |
211 | ! |
space_terms[su2] <- lsterms[su2] |
212 | ! |
su <- su | su2 |
213 |
} |
|
214 |
} |
|
215 | 21x |
if (sum(su)) { |
216 | 21x |
space <- if (use.scan) rex(which(su), space_path) else t(extract_indices(which(su), space_path)) |
217 | 21x |
rownames(space) <- space_terms[su] |
218 | 21x |
ts <- terms[terms %in% rownames(space)] |
219 | 21x |
space <- space[ts, , drop = FALSE] |
220 |
} else { |
|
221 | ! |
stop("no matching terms in space ", space) |
222 |
} |
|
223 |
} else { |
|
224 | ! |
if (is.character(term.map)) term.map <- structure(seq_along(term.map), names = term.map) |
225 | 1x |
su <- which(names(term.map) %in% terms) |
226 | 1x |
inds <- as.numeric(sort(if (length(terms) == 1 && terms == "") term.map else term.map[su])) |
227 | 1x |
if (length(inds)) { |
228 | 1x |
space <- if (use.scan) rex(inds, space_path) else t(extract_indices(inds, space_path)) |
229 | 1x |
rownames(space) <- ts <- names(term.map)[inds] |
230 |
} else { |
|
231 | ! |
stop("no matching terms in space ", space) |
232 |
} |
|
233 |
} |
|
234 |
} else { |
|
235 | ! |
if (is.data.frame(space)) space <- as.matrix(space) |
236 | 12x |
name <- deparse(substitute(space)) |
237 | 12x |
su <- terms %in% rownames(space) |
238 | 12x |
if (sum(su)) { |
239 | 12x |
ts <- terms[su] |
240 | 12x |
space <- space[ts, , drop = FALSE] |
241 | ! |
} else if (sum(su <- terms %in% colnames(space))) { |
242 | ! |
ts <- terms[su] |
243 | ! |
space <- t(space[, ts, drop = FALSE]) |
244 |
} else { |
|
245 | ! |
stop("no matching terms in provided space") |
246 |
} |
|
247 |
} |
|
248 | 34x |
if (fill.missing) { |
249 | 3x |
su <- which(!terms %in% rownames(space)) |
250 | 3x |
if (length(su)) { |
251 | 3x |
space <- rbind(space, matrix(0, length(su), ncol(space), dimnames = list(terms[su]))) |
252 | 3x |
space <- space[terms, ] |
253 |
} |
|
254 | 3x |
ts <- rownames(space) |
255 |
} |
|
256 | 34x |
attr(space, "space") <- name |
257 | 34x |
if (map.space) { |
258 | 14x |
rep <- length(ts) / ncol(dtm) |
259 | 14x |
if (rep < .2) { |
260 | ! |
warning(paste0( |
261 | ! |
"only ", round(rep * 100, 2), "% of dtm terms appear in the provided space; ", |
262 | ! |
"you might consider using a different source or cleaning/partial matching terms" |
263 | ! |
), call. = FALSE) |
264 |
} |
|
265 | 14x |
dtm <- dtm[, ts, drop = FALSE] %*% space |
266 | 14x |
attr(dtm, "space") <- name |
267 |
} else { |
|
268 | 20x |
return(space) |
269 |
} |
|
270 |
} |
|
271 | 25x |
dtm |
272 |
} |
1 |
#' Assess Dictionary Categories Within a Latent Semantic Space |
|
2 |
#' |
|
3 |
#' |
|
4 |
#' @param dict A vector of terms, list of such vectors, or a matrix-like object to be |
|
5 |
#' categorized by \code{\link{read.dic}}. |
|
6 |
#' @param space A vector space used to calculate similarities between terms. |
|
7 |
#' Names of spaces (see \code{\link{select.lspace}}), a matrix with terms as row names, or |
|
8 |
#' \code{"auto"} to auto-select a space based on matched terms. This can also be \code{multi} |
|
9 |
#' to use multiple spaces, which are combined after similarities are calculated. |
|
10 |
#' @param n_spaces Number of spaces to draw from if \code{space} is \code{multi}. |
|
11 |
#' @param suggest Logical; if \code{TRUE}, will search for other terms for possible inclusion |
|
12 |
#' in \code{space}. |
|
13 |
#' @param suggestion_terms Number of terms to use when selecting suggested additions. |
|
14 |
#' @param suggest_stopwords Logical; if \code{TRUE}, will suggest function words. |
|
15 |
#' @param suggest_discriminate Logical; if \code{TRUE}, will adjust for similarity to other |
|
16 |
#' categories when finding suggestions. |
|
17 |
#' @param expand_cutoff_freq Proportion of mapped terms to include when expanding dictionary terms. |
|
18 |
#' Applies when \code{space} is a character (referring to a space to be loaded). |
|
19 |
#' @param expand_cutoff_spaces Number of spaces in which a term has to appear to be considered |
|
20 |
#' for expansion. Applies when \code{space} is a character (referring to a space to be loaded). |
|
21 |
#' @param dimension_prop Proportion of dimensions to use when searching for suggested additions, |
|
22 |
#' where less than 1 will calculate similarities to the category core using fewer dimensions |
|
23 |
#' of the space. |
|
24 |
#' @param pairwise Logical; if \code{FALSE}, will compare candidate suggestion terms with a single, |
|
25 |
#' averaged category vector rather than all category terms separately. |
|
26 |
#' @param glob Logical; if \code{TRUE}, converts globs (asterisk wildcards) to regular expressions. |
|
27 |
#' @param space_dir Directory from which \code{space} should be loaded. |
|
28 |
#' @param verbose Logical; if \code{FALSE}, will not show status messages. |
|
29 |
#' @family Dictionary functions |
|
30 |
#' @seealso |
|
31 |
#' To just expand fuzzy terms, see \code{\link{report_term_matches}()}. |
|
32 |
#' |
|
33 |
#' Similar information is provided in the \href{https://miserman.github.io/dictionary_builder/}{dictionary builder} web tool. |
|
34 |
#' @returns A list: |
|
35 |
#' \itemize{ |
|
36 |
#' \item \strong{\code{expanded}}: A version of \code{dict} with fuzzy terms expanded. |
|
37 |
#' \item \strong{\code{summary}}: A summary of each dictionary category. |
|
38 |
#' \item \strong{\code{terms}}: Match (expanded term) similarities within terms and categories. |
|
39 |
#' \item \strong{\code{suggested}}: If \code{suggest} is \code{TRUE}, a list with suggested |
|
40 |
#' additions for each dictionary category. Each entry is a named numeric vector with |
|
41 |
#' similarities for each suggested term. |
|
42 |
#' } |
|
43 |
#' @examples |
|
44 |
#' if (dir.exists("~/Latent Semantic Spaces")) { |
|
45 |
#' dict <- list( |
|
46 |
#' furniture = c("table", "chair", "desk*", "couch*", "sofa*"), |
|
47 |
#' well_adjusted = c("happy", "bright*", "friend*", "she", "he", "they") |
|
48 |
#' ) |
|
49 |
#' dictionary_meta(dict, space_dir = "~/Latent Semantic Spaces") |
|
50 |
#' } |
|
51 |
#' @export |
|
52 | ||
53 |
dictionary_meta <- function( |
|
54 |
dict, space = "auto", n_spaces = 5, suggest = FALSE, suggestion_terms = 10, suggest_stopwords = FALSE, |
|
55 |
suggest_discriminate = TRUE, expand_cutoff_freq = .98, expand_cutoff_spaces = 10, |
|
56 |
dimension_prop = 1, pairwise = TRUE, glob = TRUE, space_dir = getOption("lingmatch.lspace.dir"), verbose = TRUE) { |
|
57 | ! |
if (missing(dict)) stop("dict must be specified", call. = FALSE) |
58 | 2x |
if (!is.list(dict)) dict <- list(dict) |
59 | 2x |
if (is.null(names(dict))) names(dict) <- paste0("cat", seq_along(dict)) |
60 | 6x |
st <- proc.time()[[3]] |
61 | 6x |
if (verbose) cat("preparing terms (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
62 | 6x |
terms <- data.frame( |
63 | 6x |
category = rep(names(dict), vapply(dict, length, 0)), term = unlist(dict), stringsAsFactors = FALSE |
64 |
) |
|
65 | 6x |
rownames(terms) <- NULL |
66 | 6x |
terms$regex <- paste0("\\b", to_regex(list(terms$term), TRUE, glob)[[1]], "\\b") |
67 | 6x |
if (is.character(space)) { |
68 | 3x |
term_map <- select.lspace(dir = space_dir, get.map = TRUE)$term_map |
69 | 3x |
if (is.null(term_map)) { |
70 | ! |
stop( |
71 | ! |
"term map not found; specify `space_dir` or provide text", |
72 | ! |
call. = FALSE |
73 |
) |
|
74 |
} |
|
75 | 3x |
if (expand_cutoff_freq > 0 && expand_cutoff_freq < 1) { |
76 | 3x |
term_map <- term_map[seq(1, ceiling(nrow(term_map) * expand_cutoff_freq)), , drop = FALSE] |
77 |
} |
|
78 | 3x |
if (expand_cutoff_spaces > 0 && expand_cutoff_spaces < ncol(term_map)) { |
79 | 3x |
term_map <- term_map[rowSums(term_map != 0) >= expand_cutoff_spaces, , drop = FALSE] |
80 |
} |
|
81 | 3x |
if (!grepl("^(?:auto|multi)", space[[1]], TRUE)) { |
82 | 1x |
space <- space[space %in% colnames(term_map)] |
83 | ! |
if (!length(space)) stop("`space` not found in `term_map` colnames", call. = FALSE) |
84 | 1x |
term_map <- term_map[rowSums(term_map[, space, drop = FALSE] != 0) != 0, , drop = FALSE] |
85 |
} |
|
86 | 3x |
space_terms <- rownames(term_map) |
87 |
} else { |
|
88 | 3x |
space_terms <- rownames(space) |
89 | 3x |
if (is.null(space_terms)) { |
90 | ! |
stop( |
91 | ! |
"for space, enter a name or matrix-like object with terms as rownames", |
92 | ! |
call. = FALSE |
93 |
) |
|
94 |
} |
|
95 |
} |
|
96 | 6x |
if (verbose) cat("expanding terms (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
97 | 6x |
matches <- extract_matches(terms$regex, paste(space_terms, collapse = " "), TRUE) |
98 | 6x |
matched_terms <- unique(unlist(lapply(matches, names), use.names = FALSE)) |
99 | 1x |
if (!length(matched_terms)) stop("no `dict` terms matched any space terms", call. = FALSE) |
100 | 5x |
multi <- FALSE |
101 | 5x |
if (is.character(space)) { |
102 | ! |
if (length(space) == 1 && !missing(n_spaces) && n_spaces > 1) space <- "multi" |
103 | 3x |
multi <- grepl("^multi", space[[1]], TRUE) |
104 | 3x |
if (length(space) > 1 || multi) { |
105 | 2x |
if (length(space) == 1 && multi) { |
106 | 1x |
term_map_matched <- term_map[rownames(term_map) %in% matched_terms, , drop = FALSE] |
107 | 1x |
commonness <- sort(-rowSums(term_map_matched != 0)) |
108 | 1x |
common_terms <- names(commonness[commonness >= min(max(commonness), n_spaces)]) |
109 | 1x |
space <- names(sort(-colSums(term_map_matched[common_terms, , drop = FALSE])))[ |
110 | 1x |
seq_len(max(1, min(nrow(term_map), n_spaces))) |
111 |
] |
|
112 |
} |
|
113 | 2x |
multi <- TRUE |
114 | 2x |
term_map <- term_map[, space, drop = FALSE] |
115 | 2x |
space_terms <- rownames(term_map)[rowSums(term_map != 0) == length(space)] |
116 | 2x |
space_name <- paste(space, collapse = ", ") |
117 | 2x |
if (verbose) cat("loading spaces (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
118 | 2x |
space <- lapply(space, function(s) { |
119 | 7x |
lma_lspace( |
120 | 7x |
if (suggest) space_terms else matched_terms[matched_terms %in% space_terms], s, |
121 | 7x |
dir = space_dir |
122 |
) |
|
123 |
}) |
|
124 | 2x |
matches <- lapply(matches, function(l) l[names(l) %in% space_terms]) |
125 | 2x |
matched_terms <- unique(unlist(lapply(matches, names), use.names = FALSE)) |
126 |
} else { |
|
127 | 1x |
if (space == "auto") { |
128 | 1x |
space <- colnames(term_map)[ |
129 | 1x |
which.max(colSums(term_map[rownames(term_map) %in% matched_terms, ] != 0)) |
130 |
] |
|
131 |
} |
|
132 | 1x |
space_name <- space |
133 | 1x |
space_terms <- rownames(term_map)[term_map[, space] != 0] |
134 | 1x |
if (verbose) cat("loading space (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
135 | 1x |
space <- lma_lspace( |
136 | 1x |
if (suggest) space_terms else matched_terms[matched_terms %in% space_terms], space, |
137 | 1x |
dir = space_dir |
138 |
) |
|
139 | 1x |
matches <- lapply(matches, function(l) l[names(l) %in% space_terms]) |
140 | 1x |
matched_terms <- unique(unlist(lapply(matches, names), use.names = FALSE)) |
141 |
} |
|
142 |
} else { |
|
143 | ! |
if (is.data.frame(space)) space <- as.matrix(space) |
144 | 2x |
space_name <- "custom" |
145 |
} |
|
146 | 5x |
cat_names <- structure(names(dict), names = names(dict)) |
147 | 5x |
dict_exp <- lapply(cat_names, function(cat) { |
148 | 17x |
unique(names(unlist(matches[terms$category == cat]))) |
149 |
}) |
|
150 | 5x |
if (multi) { |
151 | 1x |
if (!suggest) space_terms <- rownames(space[[1]]) |
152 | 2x |
if (verbose) cat("calculating term similarities (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
153 | 2x |
sims <- lapply(cat_names, function(cat) { |
154 | 8x |
su <- space_terms %in% dict_exp[[cat]] |
155 | 8x |
if (any(su)) { |
156 | 6x |
aggsim <- NULL |
157 | 6x |
for (i in seq_along(space)) { |
158 | 21x |
s <- space[[i]] |
159 | 21x |
if (dimension_prop < 1) { |
160 | 15x |
loadings <- colMeans(s[su, , drop = FALSE]) |
161 | 15x |
dsu <- order(-loadings)[seq(1, max(1, ceiling(ncol(s) * dimension_prop)))] |
162 | 15x |
s <- s[, dsu, drop = FALSE] |
163 |
} |
|
164 | 21x |
if (pairwise) { |
165 | 6x |
sim <- lma_simets(s, s[su, ], metric = "cosine", pairwise = TRUE, symmetrical = TRUE) |
166 | 4x |
if (is.null(dim(sim))) sim <- as(t(t(sim)), "CsparseMatrix") |
167 | 6x |
diag(sim[su, , drop = FALSE]) <- 0 |
168 | 6x |
ms <- min(sim) |
169 | 6x |
sim <- (sim - ms) / (max(sim) - ms) * sign(sim) |
170 | 6x |
diag(sim[su, , drop = FALSE]) <- 1 |
171 |
} else { |
|
172 | 15x |
sim <- lma_simets(s, colMeans(s[su, , drop = FALSE]), metric = "cosine", pairwise = TRUE, symmetrical = TRUE) |
173 |
} |
|
174 | 21x |
if (i == 1) { |
175 | 6x |
aggsim <- sim |
176 |
} else { |
|
177 | 15x |
aggsim <- aggsim + sim |
178 |
} |
|
179 |
} |
|
180 | 3x |
if (is.null(dim(aggsim))) aggsim <- as(t(t(aggsim)), "CsparseMatrix") |
181 | 6x |
aggsim / length(space) |
182 |
} |
|
183 |
}) |
|
184 |
} else { |
|
185 | 3x |
space <- as(space, "CsparseMatrix") |
186 | ! |
if (!suggest) space_terms <- rownames(space) |
187 | 3x |
if (verbose) cat("calculating term similarities (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
188 | 3x |
sims <- lapply(cat_names, function(cat) { |
189 | 9x |
su <- space_terms %in% dict_exp[[cat]] |
190 | 9x |
if (any(su)) { |
191 | 7x |
if (dimension_prop < 1) { |
192 | 3x |
loadings <- colSums(space[su, , drop = FALSE]) |
193 | 3x |
dsu <- order(loadings, decreasing = TRUE)[seq(1, max(1, ceiling(ncol(space) * dimension_prop)))] |
194 | 3x |
space <- space[, dsu, drop = FALSE] |
195 |
} |
|
196 | 7x |
sim <- lma_simets( |
197 | 7x |
space, if (pairwise) space[su, , drop = FALSE] else colMeans(space[su, , drop = FALSE]), |
198 | 7x |
metric = "cosine", pairwise = TRUE, symmetrical = TRUE |
199 |
) |
|
200 | 3x |
if (is.null(dim(sim))) sim <- as(t(t(sim)), "CsparseMatrix") |
201 | 7x |
sim |
202 |
} |
|
203 |
}) |
|
204 |
} |
|
205 | 5x |
if (suggest) { |
206 | 4x |
if (verbose) cat("identifying potential additions (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
207 | 4x |
if (!suggest_stopwords) is_stop <- lma_dict(as.function = TRUE) |
208 | 4x |
full_loadings <- do.call(cbind, lapply(sims, function(x) { |
209 | 13x |
if (length(x)) { |
210 | 10x |
agg <- rowMeans(x) |
211 | 10x |
agg[agg < 0] <- 0 |
212 | 10x |
agg |
213 |
} else { |
|
214 | 3x |
structure(numeric(length(space_terms)), names = space_terms) |
215 |
} |
|
216 |
})) |
|
217 | 4x |
loading_cat <- names(cat_names)[max.col(full_loadings)] |
218 | 4x |
suggested <- lapply(cat_names, function(cat) { |
219 | 13x |
s <- sims[[cat]] |
220 | 13x |
if (length(s)) { |
221 | 10x |
su <- !rownames(s) %in% dict_exp[[cat]] & loading_cat == cat |
222 | 10x |
loadings <- sort(if (suggest_discriminate && ncol(full_loadings) > 1) { |
223 | 9x |
nl <- full_loadings[su, colnames(full_loadings) != cat, drop = FALSE] |
224 | 9x |
(rowMeans(s[su, , drop = FALSE]) - nl[ |
225 | 9x |
rep(seq_len(ncol(nl)), each = nrow(nl)) == max.col(nl) |
226 | 9x |
]) / 2 |
227 |
} else { |
|
228 | 1x |
rowMeans(s[su, , drop = FALSE]) |
229 | 10x |
}, TRUE) |
230 | 10x |
if (length(loadings)) { |
231 | 10x |
if (!suggest_stopwords) loadings <- loadings[!is_stop(names(loadings))] |
232 | 10x |
co <- min(length(loadings), max(which(loadings > 0)), suggestion_terms) |
233 | 10x |
loadings[loadings > loadings[co] + Reduce("-", range(loadings[seq(1, co)])) / 2] |
234 |
} |
|
235 |
} |
|
236 |
}) |
|
237 |
} |
|
238 | 5x |
if (verbose) cat("preparing results (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
239 | 5x |
match_counts <- vapply(matches, length, 0) |
240 | 5x |
term_summary <- data.frame( |
241 | 5x |
terms[rep(seq_len(nrow(terms)), match_counts), c("category", "term")], |
242 | 5x |
match = unlist(lapply(matches, names)), |
243 | 5x |
stringsAsFactors = FALSE |
244 |
) |
|
245 | 5x |
term_summary <- cbind(term_summary, do.call(rbind, lapply( |
246 | 5x |
split( |
247 | 5x |
term_summary[, c("category", "match", "term")], |
248 | 5x |
term_summary$category |
249 | 5x |
)[unique(term_summary$category)], |
250 | 5x |
function(cl) { |
251 | 13x |
cat <- cl$category[[1]] |
252 | 13x |
if (pairwise) { |
253 | 10x |
s <- sims[[cat]] |
254 |
} else { |
|
255 | 3x |
su <- space_terms %in% dict_exp[[cat]] |
256 | 3x |
if (multi) { |
257 | 3x |
aggsim <- NULL |
258 | 3x |
for (i in seq_along(space)) { |
259 | 15x |
s <- space[[i]] |
260 | 15x |
if (sum(su) == 1) { |
261 | 5x |
sim <- Matrix(1, 1, dimnames = as.list(rep(rownames(s)[su], 2)), sparse = TRUE) |
262 |
} else { |
|
263 | 10x |
sim <- lma_simets(s[su, , drop = FALSE], metric = "cosine", pairwise = TRUE, symmetrical = TRUE) |
264 | ! |
if (is.null(dim(sim))) sim <- as(t(t(sim)), "CsparseMatrix") |
265 | 10x |
diag(sim) <- 0 |
266 | 10x |
ms <- min(sim) |
267 | 10x |
sim <- (sim - ms) / (max(sim) - ms) * sign(sim) |
268 | 10x |
diag(sim) <- 1 |
269 |
} |
|
270 | 15x |
if (i == 1) { |
271 | 3x |
aggsim <- sim |
272 |
} else { |
|
273 | 12x |
aggsim <- aggsim + sim |
274 |
} |
|
275 |
} |
|
276 | 3x |
s <- aggsim / length(space) |
277 |
} else { |
|
278 | ! |
s <- lma_simets(space[su, , drop = FALSE], metric = "cosine", pairwise = TRUE, symmetrical = TRUE) |
279 |
} |
|
280 |
} |
|
281 | 13x |
if (is.null(s)) { |
282 | ! |
cbind(sim.term = cl$match, sim.category = 0) |
283 |
} else { |
|
284 | 13x |
su <- !(cl$match %in% rownames(s)) |
285 | 13x |
if (any(su)) { |
286 | ! |
s <- rbind(s, Matrix( |
287 | ! |
0, sum(su), ncol(s), |
288 | ! |
dimnames = list(cl$match[su], colnames(s)), sparse = TRUE |
289 |
)) |
|
290 |
} |
|
291 | 13x |
term_sims <- unlist(lapply(unname(split(cl$match, cl$term)[unique(cl$term)]), function(l) { |
292 | 24x |
if (length(l) == 1) { |
293 | 20x |
structure(1, names = l) |
294 |
} else { |
|
295 | 4x |
cols <- l[l %in% colnames(s)] |
296 | 4x |
s[l, cols[which.min(nchar(cols))]] |
297 |
} |
|
298 |
})) |
|
299 | 13x |
cat_sims <- s[cl$match, which.max(if (is.null(dim(s))) { |
300 | ! |
s |
301 | 13x |
} else if (ncol(s) == 1) { |
302 | 6x |
1 |
303 |
} else { |
|
304 | 7x |
colMeans(s[colnames(s), , drop = FALSE]) |
305 |
})] |
|
306 | 13x |
cbind(sim.term = term_sims, sim.category = if (is.null(cat_sims)) 0 else cat_sims) |
307 |
} |
|
308 |
} |
|
309 |
))) |
|
310 | 5x |
summary <- cbind(data.frame( |
311 | 5x |
category = cat_names, |
312 | 5x |
n_terms = vapply(dict, length, 0), |
313 | 5x |
n_expanded = tapply(match_counts, terms$category, sum)[cat_names], |
314 | 5x |
sim.space = space_name, |
315 | 5x |
stringsAsFactors = FALSE |
316 | 5x |
), sim = do.call(rbind, lapply(sims, function(s) { |
317 | 17x |
if (length(s) && !is.null(ncol(s)) && ncol(s) == 1) { |
318 | 8x |
m <- s[matched_terms, 1] |
319 | 9x |
} else if (length(s)) { |
320 | 5x |
s <- s[colnames(s), , drop = FALSE] |
321 | 5x |
m <- (colSums(s) - 1) / (ncol(s) - 1) |
322 |
} else { |
|
323 | 4x |
m <- 0 |
324 |
} |
|
325 | 17x |
structure(summary(m), names = c("min", "q1", "median", "mean", "q3", "max")) |
326 |
}))) |
|
327 | 5x |
if (verbose) cat("done (", round(proc.time()[[3]] - st, 4), ")\n", sep = "") |
328 | 5x |
list(expanded = dict_exp, summary = summary, terms = term_summary, suggested = if (suggest) suggested) |
329 |
} |
1 |
#' Document-Term Matrix Creation |
|
2 |
#' |
|
3 |
#' Creates a document-term matrix (dtm) from a set of texts. |
|
4 |
#' @param text Texts to be processed. This can be a vector (such as a column in a data frame) |
|
5 |
#' or list. When a list, these can be in the form returned with \code{tokens.only = TRUE}, |
|
6 |
#' or a list with named vectors, where names are tokens and values are frequencies or the like. |
|
7 |
#' @param exclude A character vector of words to be excluded. If \code{exclude} is a single string |
|
8 |
#' matching \code{'function'}, \code{lma_dict(1:9)} will be used. |
|
9 |
#' @param context A character vector used to reformat text based on look- ahead/behind. For example, |
|
10 |
#' you might attempt to disambiguate \emph{like} by reformatting certain \emph{like}s |
|
11 |
#' (e.g., \code{context = c('(i) like*', '(you) like*', '(do) like')}, where words in parentheses are |
|
12 |
#' the context for the target word, and asterisks denote partial matching). This would be converted |
|
13 |
#' to regular expression (i.e., \code{'(? <= i) like\\\\b'}) which, if matched, would be |
|
14 |
#' replaced with a coded version of the word (e.g., \code{"Hey, i like that!"} would become |
|
15 |
#' \code{"Hey, i i-like that!"}). This would probably only be useful for categorization, where a |
|
16 |
#' dictionary would only include one or another version of a word (e.g., the LIWC 2015 dictionary |
|
17 |
#' does something like this with \emph{like}, and LIWC 2007 did something like this with |
|
18 |
#' \emph{kind (of)}, both to try and clean up the posemo category). |
|
19 |
#' @param replace.special Logical: if \code{TRUE}, special characters are replaced with regular |
|
20 |
#' equivalents using the \code{\link{lma_dict}} special function. |
|
21 |
#' @param numbers Logical: if \code{TRUE}, numbers are preserved. |
|
22 |
#' @param punct Logical: if \code{TRUE}, punctuation is preserved. |
|
23 |
#' @param urls Logical: if \code{FALSE}, attempts to replace all urls with "repurl". |
|
24 |
#' @param emojis Logical: if \code{TRUE}, attempts to replace emojis (e.g., ":(" would be replaced |
|
25 |
#' with "repfrown"). |
|
26 |
#' @param to.lower Logical: if \code{FALSE}, words with different capitalization are treated as |
|
27 |
#' different terms. |
|
28 |
#' @param word.break A regular expression string determining the way words are split. Default is |
|
29 |
#' \code{' +'} which breaks words at one or more blank spaces. You may also like to break by |
|
30 |
#' dashes or slashes (\code{'[ /-]+'}), depending on the text. |
|
31 |
#' @param dc.min Numeric: excludes terms appearing in the set number or fewer documents. |
|
32 |
#' Default is 0 (no limit). |
|
33 |
#' @param dc.max Numeric: excludes terms appearing in the set number or more. Default |
|
34 |
#' is Inf (no limit). |
|
35 |
#' @param sparse Logical: if \code{FALSE}, a regular dense matrix is returned. |
|
36 |
#' @param tokens.only Logical: if \code{TRUE}, returns a list rather than a matrix, with these entries: |
|
37 |
#' \tabular{ll}{ |
|
38 |
#' \code{tokens} \tab A vector of indices with terms as names. \cr |
|
39 |
#' \code{frequencies} \tab A vector of counts with terms as names. \cr |
|
40 |
#' \code{WC} \tab A vector of term counts for each document. \cr |
|
41 |
#' \code{indices} \tab A list with a vector of token indices for each document. \cr |
|
42 |
#' } |
|
43 |
#' @note |
|
44 |
#' This is a relatively simple way to make a dtm. To calculate the (more or less) standard forms of |
|
45 |
#' LSM and LSS, a somewhat raw dtm should be fine, because both processes essentially use |
|
46 |
#' dictionaries (obviating stemming) and weighting or categorization (largely obviating 'stop word' |
|
47 |
#' removal). The exact effect of additional processing will depend on the dictionary/semantic space |
|
48 |
#' and weighting scheme used (particularly for LSA). This function also does some processing which |
|
49 |
#' may matter if you plan on categorizing with categories that have terms with look- ahead/behind assertions |
|
50 |
#' (like LIWC dictionaries). Otherwise, other methods may be faster, more memory efficient, and/or more featureful. |
|
51 |
#' @return A sparse matrix (or regular matrix if \code{sparse = FALSE}), with a row per \code{text}, |
|
52 |
#' and column per term, or a list if \code{tokens.only = TRUE}. Includes an attribute with options (\code{opts}), |
|
53 |
#' and attributes with word count (\code{WC}) and column sums (\code{colsums}) if \code{tokens.only = FALSE}. |
|
54 |
#' @examples |
|
55 |
#' text <- c( |
|
56 |
#' "Why, hello there! How are you this evening?", |
|
57 |
#' "I am well, thank you for your inquiry!", |
|
58 |
#' "You are a most good at social interactions person!", |
|
59 |
#' "Why, thank you! You're not all bad yourself!" |
|
60 |
#' ) |
|
61 |
#' |
|
62 |
#' lma_dtm(text) |
|
63 |
#' |
|
64 |
#' # return tokens only |
|
65 |
#' (tokens <- lma_dtm(text, tokens.only = TRUE)) |
|
66 |
#' |
|
67 |
#' ## convert those to a regular DTM |
|
68 |
#' lma_dtm(tokens) |
|
69 |
#' |
|
70 |
#' # convert a list-representation to a sparse matrix |
|
71 |
#' lma_dtm(list( |
|
72 |
#' doc1 = c(why = 1, hello = 1, there = 1), |
|
73 |
#' doc2 = c(i = 1, am = 1, well = 1) |
|
74 |
#' )) |
|
75 |
#' @export |
|
76 | ||
77 |
lma_dtm <- function(text, exclude = NULL, context = NULL, replace.special = FALSE, numbers = FALSE, |
|
78 |
punct = FALSE, urls = TRUE, emojis = FALSE, to.lower = TRUE, word.break = " +", dc.min = 0, |
|
79 |
dc.max = Inf, sparse = TRUE, tokens.only = FALSE) { |
|
80 | 80x |
if (!is.null(dim(text))) { |
81 | ! |
if (is.character(text[, 1]) || is.factor(text[, 1])) { |
82 | ! |
text <- text[, 1] |
83 |
} else { |
|
84 | ! |
stop("enter a vector of texts as the first argument") |
85 |
} |
|
86 |
} |
|
87 | 80x |
if (is.list(text)) { |
88 | 4x |
if (all(c("tokens", "indices") %in% names(text))) { |
89 | 2x |
m <- do.call(rbind, lapply(seq_along(text$indices), function(i) { |
90 | 30x |
if (length(text$indices[[i]])) { |
91 | 28x |
inds <- as.factor(text$indices[[i]]) |
92 | 28x |
cbind(i, as.integer(levels(inds)), tabulate(inds)) |
93 |
} |
|
94 |
})) |
|
95 | 2x |
dtm <- sparseMatrix(m[, 1], m[, 2], |
96 | 2x |
x = m[, 3], dims = c(length(text$indices), length(text$tokens)), |
97 | 2x |
dimnames = list(NULL, if (is.character(text$tokens)) text$tokens else names(text$tokens)) |
98 |
) |
|
99 | ! |
if (!sparse) dtm <- as.matrix(dtm) |
100 | 2x |
attr(dtm, "colsums") <- text$frequencies |
101 | 2x |
attr(dtm, "type") <- "count" |
102 | 2x |
attr(dtm, "WC") <- text$WC |
103 | 2x |
attr(dtm, "opts") <- attr(text, "opts") |
104 | 2x |
attr(dtm, "time") <- attr(text, "time") |
105 | 2x |
return(dtm) |
106 |
} else { |
|
107 | 2x |
tokens <- unlist(unname(text), recursive = FALSE) |
108 | 2x |
cinds <- unique(names(tokens)) |
109 | 2x |
if (is.null(cinds)) { |
110 | 1x |
text <- unlist(text) |
111 |
} else { |
|
112 | 1x |
rinds <- rep(seq_along(text), vapply(text, length, 0)) |
113 | 1x |
cinds <- structure(seq_along(cinds), names = cinds) |
114 | 1x |
dtm <- sparseMatrix( |
115 | 1x |
rinds, cinds[names(tokens)], |
116 | 1x |
x = tokens, |
117 | 1x |
dims = c(length(text), length(cinds)), dimnames = list(names(text), names(cinds)) |
118 |
) |
|
119 | 1x |
if (!sparse) dtm <- as.matrix(dtm) |
120 | 1x |
return(dtm) |
121 |
} |
|
122 |
} |
|
123 |
} |
|
124 | ! |
if (is.null(text)) stop(substitute(text), " not found") |
125 | 77x |
docnames <- names(text) |
126 | 77x |
if (is.character(text) && all(nchar(text) < 500) && all(file.exists(text))) { |
127 | 1x |
text <- if (length(text) != 1 || dir.exists(text)) read.segments(text) else readLines(text) |
128 |
} |
|
129 | 77x |
text <- paste(" ", text, " ") |
130 | 77x |
st <- proc.time()[[3]] |
131 | ! |
if (replace.special) text <- lma_dict("special", as.function = gsub)(text) |
132 | 77x |
text <- gsub("(?<=[^a-z0-9])'|'(?=[^a-z0-9])", '"', text, TRUE, TRUE) |
133 | 77x |
if (!urls) { |
134 | 12x |
text <- gsub(paste0( |
135 | 12x |
"\\s[a-z]+://[^\\s]*|www\\.[^\\s]*|\\s[a-z_~-]+\\.[a-z_~-]{2,}[^\\s]*|\\s[a-z_~-]+\\.", |
136 | 12x |
"(?:io|com|net|org|gov|edu)\\s" |
137 | 12x |
), " repurl ", text, TRUE, TRUE) |
138 | 12x |
text <- gsub("(?<=[A-Z])\\.\\s", " ", text, perl = TRUE) |
139 |
} |
|
140 | 77x |
text <- gsub("\\s+", " ", text) |
141 | 77x |
text <- gsub("\\s(etc|st|rd|ft|feat|dr|drs|mr|ms|mrs|messrs|jr|prof)\\.", " \\1tempperiod", text, TRUE) |
142 | 77x |
text <- gsub("\\s\\.|\\.\\s", " . ", text) |
143 | 77x |
if (any(punct, emojis, !is.null(context))) { |
144 | 22x |
special <- lma_dict(special)[[1]] |
145 | 22x |
if (!missing(context) && length(context) == 1 && grepl("like", context, TRUE)) { |
146 | ! |
context <- special[["LIKE"]] |
147 |
} |
|
148 | 18x |
if (punct) text <- gsub(special[["ELLIPSIS"]], " repellipsis ", text) |
149 | 22x |
if (emojis) { |
150 | 2x |
for (type in c("SMILE", "FROWN")) { |
151 | 4x |
text <- gsub( |
152 | 4x |
special[[type]], paste0(" rep", tolower(type), " "), text, |
153 | 4x |
perl = TRUE |
154 |
) |
|
155 |
} |
|
156 |
} |
|
157 | 22x |
if (!missing(context)) { |
158 | 2x |
if (!any(grepl("[?=]", context))) { |
159 | 1x |
context <- gsub("^\\(", "(?<=", context) |
160 | 1x |
context <- gsub("\\((?!\\?)", "(?=", context, perl = TRUE) |
161 | 1x |
context <- gsub("(?<![)*])$", "\\\\b", context, perl = TRUE) |
162 | 1x |
context <- gsub("\\*", "\\\\w*", context, perl = TRUE) |
163 |
} |
|
164 | 2x |
context <- structure( |
165 | 2x |
as.list(context), |
166 | 2x |
names = paste("", gsub("--+", "-", gsub("[\\s^[]|\\\\s]", "-", |
167 | 2x |
gsub("[^a-z0-9\\s\\\\']|\\\\[wbs]", "", context, TRUE, TRUE), |
168 | 2x |
perl = TRUE |
169 |
)), "") |
|
170 |
) |
|
171 | 2x |
for (rn in names(context)) text <- gsub(context[[rn]], rn, text, perl = TRUE) |
172 |
} |
|
173 |
} |
|
174 | 73x |
if (to.lower) text <- tolower(text) |
175 | 77x |
if (!is.null(exclude)) { |
176 | 3x |
if (length(exclude) == 1 && grepl(exclude, "function", TRUE)) { |
177 | 1x |
exclude <- unlist(lma_dict(), use.names = FALSE) |
178 | 2x |
} else if (is.list(exclude)) exclude <- unlist(exclude, use.names = FALSE) |
179 |
} |
|
180 | 64x |
if (!numbers) text <- gsub("[[:punct:]]*[0-9][0-9,.el-]*", " ", text, TRUE, TRUE) |
181 | 77x |
text <- gsub(paste0( |
182 | 77x |
"([^a-z0-9.,':/?=#\\s-]|[:/?=#](?=\\s)|(?:(?<=\\s)[:/=-]|,)(?=[a-z])|(?<=[^a-z0-9])", |
183 | 77x |
"(,(?=[a-z0-9])|[.-](?=[a-z]))|[.,'-](?=[^0-9a-z]|[.,'-]))" |
184 | 77x |
), " \\1 ", text, TRUE, TRUE) |
185 | 77x |
text <- gsub("(\\s[a-z]+)/([a-z]+\\s)", " \\1 / \\2 ", text, TRUE, TRUE) |
186 | 77x |
text <- gsub("([a-z0-9.,'-].*[^a-z0-9])", " \\1 ", text, TRUE, TRUE) |
187 | 77x |
text <- gsub("(?<=[a-z])\\s['\u00E7\u00ED]\\s(?=[a-z])", "'", text, TRUE, TRUE) |
188 | 77x |
if (!punct) { |
189 | 59x |
text <- gsub("[^A-Za-z0-9'._-]+", " ", text) |
190 | 59x |
text <- gsub("(?=[a-z])\\.+|(?<=[^a-z0-9])['._-]+|'+\\s", " ", text, TRUE, TRUE) |
191 |
} |
|
192 | 77x |
text <- gsub("tempperiod", ".", text, fixed = TRUE) |
193 | 77x |
text <- gsub("^\\s+|\\s(?=\\s)|\\s+$", "", text, perl = TRUE) |
194 | 77x |
text <- strsplit(text, word.break) |
195 | 77x |
words <- sort(unique(unlist(text))) |
196 | 77x |
words <- words[!words == ""] |
197 | 77x |
if (!is.null(exclude)) { |
198 | ! |
if (is.list(exclude)) exclude <- unlist(exclude, use.names = FALSE) |
199 | ! |
if (!any(grepl("^", exclude, fixed = TRUE))) exclude <- gsub("\\^\\*|\\*\\$", "", paste0("^", exclude, "$")) |
200 | 3x |
if (any(ck <- grepl("[[({]", exclude) + grepl("[})]|\\]", exclude) == 1)) { |
201 | ! |
exclude[ck] <- gsub("([([{}\\])])", "\\\\\\1", exclude[ck], perl = TRUE) |
202 |
} |
|
203 | 3x |
words <- grep(paste(exclude, collapse = "|"), words, value = TRUE, invert = TRUE) |
204 |
} |
|
205 | 77x |
if (tokens.only) { |
206 | 2x |
m <- match_terms( |
207 | 2x |
text, words, !grepl("^(?:[[:punct:]]|repellipsis)$", words), |
208 | 2x |
c(length(text), length(words)), is.null(exclude), TRUE |
209 |
) |
|
210 | 2x |
names(m) <- c("tokens", "frequencies", "WC", "indices") |
211 | 2x |
m$tokens <- m$tokens + 1L |
212 | 2x |
m$tokens <- sort(m$tokens) |
213 | 2x |
inds <- vector("list", length(text)) |
214 | 2x |
l <- 0 |
215 | 2x |
for (i in seq_along(inds)) { |
216 | 30x |
if (m$WC[i]) { |
217 | 28x |
inds[[i]] <- m$indices[seq_len(m$WC[i]) + l] + 1L |
218 | 28x |
l <- l + m$WC[i] |
219 |
} else { |
|
220 | 2x |
inds[[i]] <- integer() |
221 |
} |
|
222 |
} |
|
223 | 2x |
m$indices <- inds |
224 | 2x |
if (dc.min > 0 || dc.max < Inf) { |
225 | 1x |
su <- m$frequencies > dc.min & m$frequencies < dc.max |
226 | 1x |
if (any(!su)) { |
227 | 1x |
if (!any(su)) { |
228 | ! |
warning( |
229 | ! |
"document count bounds [", dc.min, ", ", dc.max, "] exlcuded all terms, so they were ignored", |
230 | ! |
call. = FALSE |
231 |
) |
|
232 |
} else { |
|
233 | 1x |
m$frequencies <- m$frequencies[su] |
234 | 1x |
ex <- m$tokens[!su] |
235 | 1x |
m$tokens <- m$tokens[su] |
236 | 1x |
new_inds <- structure(seq_along(m$tokens), names = m$tokens) |
237 | 1x |
m$tokens[] <- new_inds |
238 | 1x |
for (i in seq_along(m$indices)) { |
239 | 15x |
inds <- m$indices[[i]] |
240 | 15x |
m$indices[[i]] <- unname(new_inds[as.character(inds[!inds %in% ex])]) |
241 | 15x |
m$WC[i] <- length(m$indices[[i]]) |
242 |
} |
|
243 |
} |
|
244 |
} |
|
245 |
} |
|
246 | 2x |
names(m$frequencies) <- names(m$tokens) |
247 |
} else { |
|
248 | 75x |
msu <- match_terms( |
249 | 75x |
text, words, !grepl("^(?:[[:punct:]]|repellipsis)$", words), |
250 | 75x |
c(length(text), length(words)), is.null(exclude), FALSE |
251 |
) |
|
252 | 75x |
m <- if (sparse) as(msu[[1]], "CsparseMatrix") else as.matrix(msu[[1]]) |
253 | 2x |
if (length(docnames) == nrow(m)) rownames(m) <- docnames |
254 | 75x |
su <- msu[[3]] > dc.min & msu[[3]] < dc.max |
255 | 75x |
names(msu[[3]]) <- words |
256 | 75x |
if (any(!su)) { |
257 | 1x |
if (!any(su)) { |
258 | ! |
warning( |
259 | ! |
"document count bounds [", dc.min, ", ", dc.max, "] exlcuded all terms, so they were ignored", |
260 | ! |
call. = FALSE |
261 |
) |
|
262 |
} else { |
|
263 | 1x |
m <- m[, su, drop = FALSE] |
264 |
} |
|
265 |
} |
|
266 | 75x |
attr(m, "WC") <- unlist(msu[[2]], use.names = FALSE) |
267 | 75x |
attr(m, "colsums") <- msu[[3]] |
268 | 75x |
attr(m, "type") <- "count" |
269 | 75x |
if (!missing(dc.min) || !missing(dc.max)) { |
270 | 1x |
attr(m, "info") <- paste( |
271 | 1x |
"a lim of", dc.min, "and", dc.max, "left", sum(su), "of", length(words), "unique terms" |
272 |
) |
|
273 |
} |
|
274 |
} |
|
275 | 77x |
attr(m, "opts") <- c(numbers = numbers, punct = punct, urls = urls, to.lower = to.lower) |
276 | 77x |
attr(m, "time") <- c(dtm = proc.time()[[3]] - st) |
277 | 77x |
m |
278 |
} |
1 |
#' Initialize Directories for Dictionaries and Latent Semantic Spaces |
|
2 |
#' |
|
3 |
#' Creates directories for dictionaries and latent semantic spaces if needed, sets them as the |
|
4 |
#' \cr \code{lingmatch.dict.dir} and \code{lingmatch.lspace.dir} options if they are not already set, |
|
5 |
#' and creates links to them in their expected locations (\code{'~/Dictionaries'} and |
|
6 |
#' \code{'~/Latent Semantic Spaces'}) by default if applicable. |
|
7 |
#' @param base Path to a directory in which to create the \code{dict} and \code{lspace} subdirectories. |
|
8 |
#' @param dict Path to the dictionaries directory relative to \code{base}. |
|
9 |
#' @param lspace Path to the latent semantic spaces directory relative to \code{base}. |
|
10 |
#' @param link Logical; if \code{TRUE} (default), the full \code{dict} and/or \code{lspace} paths exist |
|
11 |
#' (potentially after being created), and they are not \code{'~/Dictionaries'} or \code{'~/Latent Semantic Spaces'} |
|
12 |
#' respectively, junctions (Windows) or symbolic links will be created: \code{~/Dictionaries} \code{<<===>>} |
|
13 |
#' \code{dict} and \code{~/Latent Semantic Spaces} \code{<<===>>} \code{lspace}. |
|
14 |
#' @return Paths to the [1] dictionaries and [2] latent semantic space directories, or a single path if only |
|
15 |
#' \code{dict} or \code{lspace} is specified. |
|
16 |
#' @examples |
|
17 |
#' \dontrun{ |
|
18 |
#' |
|
19 |
#' # set up the expected dictionary and latent semantic space directories |
|
20 |
#' lma_initdirs("~") |
|
21 |
#' |
|
22 |
#' # set up directories elsewhere, and links to the expected locations |
|
23 |
#' lma_initdirs("d:") |
|
24 |
#' |
|
25 |
#' # point options and create links to preexisting directories |
|
26 |
#' lma_initdirs("~/NLP_Resources", "Dicts", "Dicts/Embeddings") |
|
27 |
#' |
|
28 |
#' # create just a dictionaries directory and set the |
|
29 |
#' # lingmatch.dict.dir option without creating a link |
|
30 |
#' lma_initdirs(dict = "z:/external_dictionaries", link = FALSE) |
|
31 |
#' } |
|
32 |
#' @export |
|
33 | ||
34 |
lma_initdirs <- function(base = "", dict = "Dictionaries", lspace = "Latent Semantic Spaces", link = TRUE) { |
|
35 | 3x |
mck <- c(missing(dict), missing(lspace)) |
36 | 3x |
if (base == "" && all(mck)) { |
37 | ! |
base <- gsub('^[\'"]+|[\'"]+$', "", readline(paste0( |
38 | ! |
"Enter the path to a directory; ~ is recommended: \n", |
39 | ! |
"This is where ", dict, " and ", lspace, " subdirectories will be made." |
40 |
))) |
|
41 | ! |
if (grepl("^(?:cancel|exit|stop|q|x|quit|no|nvm|nevermind)?$", tolower(base))) { |
42 | ! |
stop( |
43 | ! |
"Specify a path to a directory in which you want dictionaries", |
44 | ! |
' and latent semantic spaces to be stored; e.g., "~".', |
45 | ! |
call. = FALSE |
46 |
) |
|
47 |
} |
|
48 |
} |
|
49 | 3x |
dirs <- normalizePath(paste0(base, if (base != "") "/", c(dict, lspace)), "/", FALSE) |
50 | 3x |
names(dirs) <- c("dict", "lspace") |
51 | 2x |
if (!all(mck)) dirs <- dirs[!mck] |
52 | 3x |
if ("dict" %in% names(dirs)) { |
53 | 2x |
if (!dir.exists(dirs[["dict"]])) dir.create(dirs[["dict"]], recursive = TRUE) |
54 | 1x |
if (getOption("lingmatch.dict.dir") == "") options(lingmatch.dict.dir = dirs[["dict"]]) |
55 |
} |
|
56 | 3x |
if ("lspace" %in% names(dirs)) { |
57 | 1x |
if (!dir.exists(dirs[["lspace"]])) dir.create(dirs[["lspace"]], recursive = TRUE) |
58 | 1x |
if (getOption("lingmatch.lspace.dir") == "") options(lingmatch.lspace.dir = dirs[["lspace"]]) |
59 |
} |
|
60 | 3x |
if (link) { |
61 | ! |
linker <- if (Sys.info()[["sysname"]] == "Windows") Sys.junction else file.symlink |
62 | ! |
if (dir.exists(dirs[["dict"]]) && !dir.exists("~/Dictionaries")) { |
63 | ! |
linker(dirs[["dict"]], "~/Dictionaries") |
64 | ! |
message("created dictionaries link:\n ", dirs[["dict"]], " <<==>> ", path.expand("~/Dictionaries")) |
65 |
} |
|
66 | ! |
if (dir.exists(dirs[["lspace"]]) && !dir.exists("~/Latent Semantic Spaces")) { |
67 | ! |
linker(dirs[["lspace"]], "~/Latent Semantic Spaces") |
68 | ! |
message("created latent space link:\n ", dirs[["lspace"]], " <<==>> ", path.expand("~/Latent Semantic Spaces")) |
69 |
} |
|
70 |
} |
|
71 | 3x |
dirs |
72 |
} |
1 |
#' Standardize a Latent Semantic Space |
|
2 |
#' |
|
3 |
#' Reformat a .rda file which has a matrix with terms as row names, or a plain-text embeddings file |
|
4 |
#' which has a term at the start of each line, and consistent delimiting characters. Plain-text files |
|
5 |
#' are processed line-by-line, so large spaces can be reformatted RAM-conservatively. |
|
6 |
#' |
|
7 |
#' @param infile Name of the .rda or plain-text file relative to \code{dir}, \cr |
|
8 |
#' e.g., "default.rda" or "glove/glove.6B.300d.txt". |
|
9 |
#' @param name Base name of the reformatted file and term file; e.g., "glove" would result in |
|
10 |
#' \code{glove.dat} and \code{glove_terms.txt} in \code{outdir}. |
|
11 |
#' @param sep Delimiting character between values in each line, e.g., \code{" "} or \code{"\\t"}. |
|
12 |
#' Only applies to plain-text files. |
|
13 |
#' @param digits Number of digits to round values to; default is 9. |
|
14 |
#' @param dir Path to folder containing \code{infile}s. \cr Default is \code{getOption('lingmatch.lspace.dir')}, |
|
15 |
#' which must be set in the current session. If this is not specified and \code{infile} is a full path, |
|
16 |
#' \code{dir} will be set to \code{infile}'s parent directory. |
|
17 |
#' @param outdir Path to folder in which to save standardized files; default is \code{dir}. |
|
18 |
#' @param remove A string with a regex pattern to be removed from term names \cr (i.e., \code{gsub(remove,} |
|
19 |
#' \code{"", term)}); default is \code{""}, which is ignored. |
|
20 |
#' @param term_check A string with a regex pattern by which to filter terms; i.e., only lines with fully |
|
21 |
#' matched terms are written to the reformatted file. The default attempts to retain only regular words, including |
|
22 |
#' those with dashes, foreword slashes, and periods. Set to an empty string (\code{""}) to write all lines |
|
23 |
#' regardless of term. |
|
24 |
#' @param verbose Logical: if \code{TRUE}, prints the current line number and its term to the console every 1,000 lines. |
|
25 |
#' Only applies to plain-text files. |
|
26 |
#' @family Latent Semantic Space functions |
|
27 |
#' @return Path to the standardized [1] data file and [2] terms file if applicable. |
|
28 |
#' @examples |
|
29 |
#' \dontrun{ |
|
30 |
#' |
|
31 |
#' # from https://sites.google.com/site/fritzgntr/software-resources/semantic_spaces |
|
32 |
#' standardize.lspace("EN_100k_lsa.rda", "100k_lsa") |
|
33 |
#' |
|
34 |
#' # from https://fasttext.cc/docs/en/english-vectors.html |
|
35 |
#' standardize.lspace("crawl-300d-2M.vec", "facebook_crawl") |
|
36 |
#' |
|
37 |
#' # Standardized versions of these spaces can also be downloaded with download.lspace. |
|
38 |
#' } |
|
39 |
#' @export |
|
40 | ||
41 |
standardize.lspace <- function(infile, name, sep = " ", digits = 9, dir = getOption("lingmatch.lspace.dir"), |
|
42 |
outdir = dir, remove = "", term_check = "^[a-zA-Z]+$|^['a-zA-Z][a-zA-Z.'\\/-]*[a-zA-Z.]$", verbose = FALSE) { |
|
43 | 3x |
if (is.character(infile) && file.exists(infile) && missing(dir)) { |
44 | 1x |
dir <- dirname(normalizePath(infile, mustWork = FALSE)) |
45 | 1x |
if (missing(outdir)) outdir <- dir |
46 | 2x |
} else if (dir == "") { |
47 | ! |
if (outdir != "") { |
48 | ! |
dir <- outdir |
49 |
} else { |
|
50 | ! |
stop(paste( |
51 | ! |
"specify a directory (dir), or set the lspace directory option\n(e.g.,", |
52 | ! |
'options(lingmatch.lspace.dir = ~/Latent Semantic Spaces")) or call lma_initdir()' |
53 | ! |
), call. = FALSE) |
54 |
} |
|
55 |
} |
|
56 | ! |
if (!is.character(term_check)) term_check <- "" |
57 | 3x |
if (is.character(infile)) { |
58 | 3x |
ip <- normalizePath(paste0(dir, "/", infile), "/", FALSE) |
59 | 1x |
if (!file.exists(ip)) ip <- infile |
60 |
} |
|
61 | 3x |
op <- normalizePath(paste0(outdir, "/", name), "/", FALSE) |
62 | 3x |
fs <- op |
63 | 3x |
if (!is.character(infile) || grepl("\\.rda$", infile)) { |
64 | 1x |
if (is.character(infile)) { |
65 | 1x |
f <- load(ip) |
66 | 1x |
o <- get(f) |
67 |
} else { |
|
68 | ! |
o <- infile |
69 |
} |
|
70 | 1x |
o <- round(o, digits) |
71 | 1x |
ot <- rownames(o) |
72 | ! |
if (remove != "") ot <- gsub(remove, "", ot) |
73 | 1x |
if (term_check != "") { |
74 | 1x |
su <- grepl(term_check, ot) |
75 | 1x |
o <- o[su, ] |
76 | 1x |
ot <- ot[su] |
77 |
} |
|
78 | 1x |
fs <- paste0(op, c(".dat", "_terms.txt")) |
79 | 1x |
writeLines(ot, fs[2]) |
80 | 1x |
write(formatC(t(o), digits, 0, "f"), fs[1], ncol(o)) |
81 | 1x |
if (is.character(infile)) rm(f, "o") |
82 |
} else { |
|
83 | ! |
if (!file.exists(ip)) stop("infile does not exist: ", ip) |
84 | 2x |
if (!grepl(term_check, scan(ip, "", 1, sep = sep, quiet = TRUE))) { |
85 | ! |
stop("infile does not appear to start with a term: ", ip) |
86 |
} |
|
87 | 2x |
fs <- paste0(op, ".dat") |
88 | 2x |
reformat_embedding(ip, op, sep, digits, remove, term_check, verbose) |
89 |
} |
|
90 | 3x |
message("created ", op, ".dat\nfrom ", ip) |
91 | 3x |
fs |
92 |
} |
1 |
#' Read and Segment Multiple Texts |
|
2 |
#' |
|
3 |
#' Split texts by word count or specific characters. Input texts directly, or read them in from files. |
|
4 |
#' |
|
5 |
#' @param path Path to a folder containing files, or a vector of paths to files. If no folders or files are |
|
6 |
#' recognized in \code{path}, it is treated as \code{text}. |
|
7 |
#' @param segment Specifies how the text of each file should be segmented. If a character, split at that character; |
|
8 |
#' '\\n' by default. If a number, texts will be broken into that many segments, each with a roughly equal number of |
|
9 |
#' words. |
|
10 |
#' @param ext The extension of the files you want to read in. '.txt' by default. |
|
11 |
#' @param subdir Logical; if \code{TRUE}, files in folders in \code{path} will also be included. |
|
12 |
#' @param segment.size Logical; if specified, \code{segment} will be ignored, and texts will be broken into |
|
13 |
#' segments containing roughly \code{segment.size} number of words. |
|
14 |
#' @param bysentence Logical; if \code{TRUE}, and \code{segment} is a number or \code{segment.size} is specified, |
|
15 |
#' sentences will be kept together, rather than potentially being broken across segments. |
|
16 |
#' @param end_in_quotes Logical; if \code{FALSE}, sentence-ending marks (\code{.?!}) will not be considered when |
|
17 |
#' immediately followed by a quotation mark. For example, \code{'"Word." Word.'} would be considered one sentence. |
|
18 |
#' @param preclean Logical; if \code{TRUE}, text will be cleaned with \code{lma_dict(special)} before |
|
19 |
#' segmentation. |
|
20 |
#' @param text A character vector with text to be split, used in place of \code{path}. Each entry is treated as a file. |
|
21 |
#' @return A \code{data.frame} with columns for file names (\code{input}), |
|
22 |
#' segment number within file (\code{segment}), word count for each segment (\code{WC}), and the text of |
|
23 |
#' each segment (\code{text}). |
|
24 |
#' @examples |
|
25 |
#' # split preloaded text |
|
26 |
#' read.segments("split this text into two segments", 2) |
|
27 |
#' |
|
28 |
#' \dontrun{ |
|
29 |
#' |
|
30 |
#' # read in all files from the package directory |
|
31 |
#' texts <- read.segments(path.package("lingmatch"), ext = "") |
|
32 |
#' texts[, -4] |
|
33 |
#' |
|
34 |
#' # segment .txt files in dir in a few ways: |
|
35 |
#' dir <- "path/to/files" |
|
36 |
#' |
|
37 |
#' ## into 1 line segments |
|
38 |
#' texts_lines <- read.segments(dir) |
|
39 |
#' |
|
40 |
#' ## into 5 even segments each |
|
41 |
#' texts_5segs <- read.segments(dir, 5) |
|
42 |
#' |
|
43 |
#' ## into 50 word segments |
|
44 |
#' texts_50words <- read.segments(dir, segment.size = 50) |
|
45 |
#' |
|
46 |
#' ## into 1 sentence segments |
|
47 |
#' texts_1sent <- read.segments(dir, segment.size = 1, bysentence = TRUE) |
|
48 |
#' } |
|
49 |
#' @export |
|
50 | ||
51 |
read.segments <- function(path = ".", segment = NULL, ext = ".txt", subdir = FALSE, segment.size = -1, |
|
52 |
bysentence = FALSE, end_in_quotes = TRUE, preclean = FALSE, text = NULL) { |
|
53 | ! |
if (any(path == "")) path[path == ""] <- "." |
54 | 14x |
if (!any(file.exists(sub("[\\/]+$", "", path)))) { |
55 | 6x |
ck_text <- TRUE |
56 | 6x |
files <- path |
57 |
} else { |
|
58 | 8x |
ck_text <- !is.null(text) |
59 | 8x |
files <- if (ck_text) { |
60 | ! |
text |
61 |
} else { |
|
62 | 8x |
dirs <- list.dirs(path, recursive = subdir) |
63 | 8x |
files <- if (any(dir.exists(path))) { |
64 | 1x |
unique(list.files(path, ext, recursive = subdir, full.names = TRUE)) |
65 |
} else { |
|
66 | 7x |
path[file.exists(path)] |
67 |
} |
|
68 | 8x |
files[!files %in% dirs] |
69 |
} |
|
70 |
} |
|
71 | 14x |
if (missing(segment) && missing(segment.size)) { |
72 | 4x |
segment <- if (length(path) == 1 && (ck_text || !dir.exists(path))) "\n" else 1 |
73 |
} |
|
74 | 14x |
if (length(files)) { |
75 | 14x |
err <- function(e) NULL |
76 | 14x |
args <- list(what = character(), quote = "", na.strings = "", quiet = TRUE) |
77 | ! |
if (is.character(segment) && segment.size == -1) args$sep <- segment |
78 | 14x |
do.call(rbind, lapply(seq_along(files), function(fi) { |
79 | 44x |
f <- files[fi] |
80 | 44x |
args[[if (ck_text) "text" else "file"]] <- f |
81 | 44x |
WC <- NULL |
82 | 44x |
if (ck_text || file.exists(f)) { |
83 | 44x |
if (is.numeric(segment) || segment.size > 0) { |
84 | 44x |
words <- tryCatch(do.call(scan, args), error = err) |
85 | 44x |
if (!length(words)) { |
86 | ! |
return(NULL) |
87 |
} |
|
88 | ! |
if (preclean) words <- lma_dict("special", as.function = gsub)(words) |
89 | 44x |
TWC <- length(words) |
90 | 32x |
if (segment.size == -1) segment.size <- ceiling(TWC / segment) |
91 | 44x |
if (bysentence) { |
92 | 8x |
if (!is.null(segment) && is.numeric(segment)) { |
93 | 4x |
lines <- character(segment) |
94 | 4x |
WC <- numeric(segment) |
95 |
} else { |
|
96 | 4x |
lines <- NULL |
97 | 4x |
WC <- NULL |
98 |
} |
|
99 | 8x |
sentends <- grep("[.?!]$", if (end_in_quotes) { |
100 | 7x |
gsub(if (preclean) { |
101 |
'["\']+' |
|
102 |
} else { |
|
103 | 7x |
paste0("(?:", paste( |
104 | 7x |
c( |
105 |
'["\']', |
|
106 | 7x |
unlist(lma_dict("special")$special$CHARACTERS[c('"', "'")]) |
107 |
), |
|
108 | 7x |
collapse = "|" |
109 |
), ")+$") |
|
110 | 7x |
}, "", words) |
111 |
} else { |
|
112 | 1x |
words |
113 |
}) |
|
114 | 8x |
if (length(sentends)) { |
115 | 6x |
sentends <- sentends[!grepl(paste0( |
116 | 6x |
"\\.[a-z]|^(?:[a-z]|[ivxm]+|\\d+|ans|govt|apt|etc|", |
117 | 6x |
"st|rd|ft|feat|dr|drs|mr|ms|mrs|messrs|jr|prof)[.?!]$" |
118 | 6x |
), words[sentends], TRUE, TRUE)] |
119 |
} |
|
120 | 8x |
sentends <- c(1, sentends) |
121 | 8x |
nsents <- length(sentends) |
122 | 8x |
if (sentends[nsents] != TWC) { |
123 | 3x |
sentends <- c(sentends, TWC) |
124 | 3x |
nsents <- nsents + 1 |
125 |
} |
|
126 | 8x |
i <- s <- p <- 1 |
127 | 8x |
while (p < nsents && sum(WC) < TWC) { |
128 | 17x |
WC[i] <- 0 |
129 | 17x |
while (p < nsents && WC[i] < segment.size) { |
130 | 56x |
p <- p + 1 |
131 | 56x |
WC[i] <- (sentends[p] - s) + 1 |
132 |
} |
|
133 | 17x |
lines[i] <- paste(words[seq(s, sentends[p])], collapse = " ") |
134 | 17x |
s <- sentends[p] + 1 |
135 | 17x |
i <- i + 1 |
136 |
} |
|
137 |
} else { |
|
138 | 36x |
segment <- ceiling(TWC / segment.size) |
139 | 36x |
lines <- character(segment) |
140 | 36x |
WC <- rep(segment.size, segment) |
141 | 36x |
WCC <- 0 |
142 | 36x |
for (i in seq_len(segment)) { |
143 | 14x |
if (WCC + WC[i] > TWC) WC[i] <- TWC - WCC |
144 | 92x |
lines[i] <- paste(words[seq(WCC + 1, WCC + WC[i])], collapse = " ") |
145 | 92x |
WCC <- WCC + WC[i] |
146 |
} |
|
147 |
} |
|
148 |
} else { |
|
149 | ! |
lines <- tryCatch(do.call(scan, args), error = err) |
150 | ! |
if (!length(lines)) { |
151 | ! |
return(NULL) |
152 |
} |
|
153 |
} |
|
154 |
} else { |
|
155 | ! |
lines <- "" |
156 |
} |
|
157 | 44x |
data.frame( |
158 | 44x |
input = if (ck_text) fi else f, segment = seq_along(lines), |
159 | 44x |
WC = if (is.null(WC)) vapply(strsplit(lines, "\\s+"), function(sp) sum(sp != ""), 0) else WC, |
160 | 44x |
text = lines, stringsAsFactors = FALSE |
161 |
) |
|
162 |
})) |
|
163 |
} else { |
|
164 | ! |
warning( |
165 | ! |
"no files found", if (!subdir) "; might try setting subdir to TRUE to include files in folders" |
166 |
) |
|
167 |
} |
|
168 |
} |
1 |
#' Similarity Calculations |
|
2 |
#' |
|
3 |
#' Enter a numerical matrix, set of vectors, or set of matrices to calculate similarity per vector. |
|
4 |
#' |
|
5 |
#' @param a A vector or matrix. If a vector, \code{b} must also be provided. If a matrix and \code{b} |
|
6 |
#' is missing, each row will be compared. If a matrix and \code{b} is not missing, each row will |
|
7 |
#' be compared with \code{b} or each row of \code{b}. |
|
8 |
#' @param b A vector or matrix to be compared with \code{a} or rows of \code{a}. |
|
9 |
#' @param metric A character or vector of characters at least partially matching one of the |
|
10 |
#' available metric names (or 'all' to explicitly include all metrics), |
|
11 |
#' or a number or vector of numbers indicating the metric by index: |
|
12 |
#' \itemize{ |
|
13 |
#' \item \strong{\code{jaccard}}: \code{sum(a & b) / sum(a | b)} |
|
14 |
#' \item \strong{\code{euclidean}}: \code{1 / (1 + sqrt(sum((a - b) ^ 2)))} |
|
15 |
#' \item \strong{\code{canberra}}: \code{mean(1 - abs(a - b) / (a + b))} |
|
16 |
#' \item \strong{\code{cosine}}: \code{sum(a * b) / sqrt(sum(a ^ 2 * sum(b ^ 2)))} |
|
17 |
#' \item \strong{\code{pearson}}: \code{(mean(a * b) - (mean(a) * mean(b))) /} \cr |
|
18 |
#' \code{sqrt(mean(a ^ 2) - mean(a) ^ 2) / sqrt(mean(b ^ 2) - mean(b) ^ 2)} |
|
19 |
#' } |
|
20 |
#' @param group If \code{b} is missing and \code{a} has multiple rows, this will be used to make |
|
21 |
#' comparisons between rows of \code{a}, as modified by \code{agg} and \code{agg.mean}. |
|
22 |
#' @param lag Amount to adjust the \code{b} index; either rows if \code{b} has multiple rows (e.g., |
|
23 |
#' for \code{lag = 1}, \code{a[1, ]} is compared with \code{b[2, ]}), or values otherwise (e.g., |
|
24 |
#' for \code{lag = 1}, \code{a[1]} is compared with \code{b[2]}). If \code{b} is not supplied, |
|
25 |
#' \code{b} is a copy of \code{a}, resulting in lagged self-comparisons or autocorrelations. |
|
26 |
#' @param agg Logical: if \code{FALSE}, only the boundary rows between groups will be compared, see |
|
27 |
#' example. |
|
28 |
#' @param agg.mean Logical: if \code{FALSE} aggregated rows are summed instead of averaged. |
|
29 |
#' @param pairwise Logical: if \code{FALSE} and \code{a} and \code{b} are matrices with the same number of |
|
30 |
#' rows, only paired rows are compared. Otherwise (and if only \code{a} is supplied), all pairwise |
|
31 |
#' comparisons are made. |
|
32 |
#' @param symmetrical Logical: if \code{TRUE} and pairwise comparisons between \code{a} rows were made, |
|
33 |
#' the results in the lower triangle are copied to the upper triangle. |
|
34 |
#' @param mean Logical: if \code{TRUE}, a single mean for each metric is returned per row of \code{a}. |
|
35 |
#' @param return.list Logical: if \code{TRUE}, a list-like object will always be returned, with an entry |
|
36 |
#' for each metric, even when only one metric is requested. |
|
37 |
#' @details |
|
38 |
#' Use \code{\link[RcppParallel]{setThreadOptions}} to change parallelization options; e.g., run |
|
39 |
#' RcppParallel::setThreadOptions(4) before a call to lma_simets to set the number of CPU |
|
40 |
#' threads to 4. |
|
41 |
#' @return Output varies based on the dimensions of \code{a} and \code{b}: |
|
42 |
#' \itemize{ |
|
43 |
#' \item \strong{Out:} A vector with a value per metric. \cr |
|
44 |
#' \strong{In:} Only when \code{a} and \code{b} are both vectors. |
|
45 |
#' \item \strong{Out:} A vector with a value per row. \cr |
|
46 |
#' \strong{In:} Any time a single value is expected per row: \code{a} or \code{b} is a vector, |
|
47 |
#' \code{a} and \code{b} are matrices with the same number of rows and \code{pairwise = FALSE}, a group is |
|
48 |
#' specified, or \code{mean = TRUE}, and only one metric is requested. |
|
49 |
#' \item \strong{Out:} A data.frame with a column per metric. \cr |
|
50 |
#' \strong{In:} When multiple metrics are requested in the previous case. |
|
51 |
#' \item \strong{Out:} A sparse matrix with a \code{metric} attribute with the metric name. \cr |
|
52 |
#' \strong{In:} Pairwise comparisons within an \code{a} matrix or between |
|
53 |
#' an \code{a} and \code{b} matrix, when only 1 metric is requested. |
|
54 |
#' \item \strong{Out:} A list with a sparse matrix per metric. \cr |
|
55 |
#' \strong{In:} When multiple metrics are requested in the previous case. |
|
56 |
#' } |
|
57 |
#' @examples |
|
58 |
#' text <- c( |
|
59 |
#' "words of speaker A", "more words from speaker A", |
|
60 |
#' "words from speaker B", "more words from speaker B" |
|
61 |
#' ) |
|
62 |
#' (dtm <- lma_dtm(text)) |
|
63 |
#' |
|
64 |
#' # compare each entry |
|
65 |
#' lma_simets(dtm) |
|
66 |
#' |
|
67 |
#' # compare each entry with the mean of all entries |
|
68 |
#' lma_simets(dtm, colMeans(dtm)) |
|
69 |
#' |
|
70 |
#' # compare by group (corresponding to speakers and turns in this case) |
|
71 |
#' speaker <- c("A", "A", "B", "B") |
|
72 |
#' |
|
73 |
#' ## by default, consecutive rows from the same group are averaged: |
|
74 |
#' lma_simets(dtm, group = speaker) |
|
75 |
#' |
|
76 |
#' ## with agg = FALSE, only the rows at the boundary between |
|
77 |
#' ## groups (rows 2 and 3 in this case) are used: |
|
78 |
#' lma_simets(dtm, group = speaker, agg = FALSE) |
|
79 |
#' @export |
|
80 | ||
81 |
lma_simets <- function(a, b = NULL, metric = NULL, group = NULL, lag = 0, agg = TRUE, agg.mean = TRUE, |
|
82 |
pairwise = TRUE, symmetrical = FALSE, mean = FALSE, return.list = FALSE) { |
|
83 | 515x |
mets <- c("jaccard", "euclidean", "canberra", "cosine", "pearson") |
84 | 515x |
if (missing(metric) && length(b) == 1 && !grepl(" ", b) && |
85 | 515x |
any(grepl(tolower(substr(b, 1, 3)), mets, fixed = TRUE))) { |
86 | 8x |
metric <- b |
87 | 8x |
b <- NULL |
88 |
} |
|
89 | 515x |
met <- match_metric(metric) |
90 | 515x |
if (!length(met$selected)) { |
91 | ! |
stop( |
92 | ! |
"no recognized metric; should match one of ", |
93 | ! |
paste0(mets, collapse = ", "), ", or all" |
94 |
) |
|
95 |
} |
|
96 | 515x |
st <- proc.time()[[3]] |
97 | 515x |
slots <- c("i", "p", "x", "Dim") |
98 | 515x |
if ((is.character(a) || is.factor(a)) && any(grepl("[a-zA-Z]", a))) { |
99 | 2x |
a <- lma_dtm(a) |
100 | 75x |
} else if (is.data.frame(a)) a <- Matrix(as.matrix(a), sparse = TRUE) |
101 | 1x |
if (is.null(b) && !missing(lag) && is.null(dim(a))) b <- a |
102 | 515x |
if (is.null(b)) { |
103 | 114x |
n <- dim(a)[1] |
104 | ! |
if (is.null(n) || n < 2) stop("a must have more than 1 row when b is not provided", call. = FALSE) |
105 | 114x |
if (is.null(group)) { |
106 | 63x |
if (!all(slots %in% slotNames(a))) a <- as(a, "CsparseMatrix") |
107 | 77x |
res <- calculate_similarities(a, NULL, 2, met$dummy) |
108 | 77x |
for (i in seq_along(res)) attr(res[[i]], "metric") <- met$selected[i] |
109 |
} else { |
|
110 | ! |
if (length(group) != n) stop("group is not the same length as a or columns in a") |
111 | 37x |
ager <- if (agg.mean) colMeans else colSums |
112 | 37x |
l <- length(group) |
113 | 37x |
chunks <- NULL |
114 | 37x |
i <- 1 |
115 | 37x |
while (i < l) { |
116 | 193x |
st <- i |
117 | 193x |
g <- group[i] |
118 | 193x |
while (i < l && g == group[i + 1]) i <- i + 1 |
119 | 193x |
chunks <- c(chunks, list(seq(st, i))) |
120 | 193x |
i <- i + 1 |
121 |
} |
|
122 | 25x |
if (!any(chunks[[length(chunks)]] == l)) chunks <- c(chunks, list(l)) |
123 | 37x |
rows <- character(length(chunks) - 1) |
124 | 37x |
res <- as.data.frame(matrix(0, length(chunks) - 1, sum(met$dummy), dimnames = list(NULL, met$selected))) |
125 | 37x |
for (i in seq_len(length(chunks) - 1)) { |
126 | 181x |
s <- chunks[[i]] |
127 | 181x |
sa <- if (agg) s else s[length(s)] |
128 | 181x |
ta <- ager(a[sa, , drop = FALSE]) |
129 | 181x |
s <- chunks[[i + 1]] |
130 | 181x |
sb <- if (agg) s else s[1] |
131 | 181x |
tb <- ager(a[sb, , drop = FALSE]) |
132 | 181x |
res[i, ] <- vector_similarity(ta, tb, met$dummy) |
133 | 181x |
rows[i] <- paste(paste(sa, collapse = ", "), "<->", paste(sb, collapse = ", ")) |
134 |
} |
|
135 | 37x |
rownames(res) <- rows |
136 |
} |
|
137 |
} else { |
|
138 | 401x |
if ((is.character(b) || is.factor(b)) && any(grepl("[a-zA-Z]", b))) { |
139 | 1x |
b <- lma_dtm(b) |
140 | 7x |
} else if (is.data.frame(b)) b <- Matrix(as.matrix(b), sparse = TRUE) |
141 | 401x |
bn <- if (is.null(dim(b))) length(b) else dim(b)[1] |
142 | 4x |
if (lag && abs(lag) >= bn) lag <- if (lag < 0) -bn + 1 else bn - 1 |
143 | 401x |
res <- if (is.null(dim(b)) && length(a) == bn && (is.null(dim(a)) || any(dim(a) == 1))) { |
144 | 247x |
b <- as.numeric(b) |
145 | 5x |
if (lag) b <- if (lag < 0) c(b[-seq_len(-lag)], numeric(-lag)) else c(numeric(lag), b)[seq_len(bn)] |
146 | 247x |
vector_similarity(as.numeric(a), b, met$dummy) |
147 |
} else { |
|
148 | 2x |
if (is.null(dim(a))) a <- Matrix(a, 1, dimnames = list(NULL, names(a)), sparse = TRUE) |
149 | 85x |
if (!all(slots %in% slotNames(a))) a <- as(a, "CsparseMatrix") |
150 | 76x |
if (is.null(dim(b))) b <- Matrix(b, 1, dimnames = list(NULL, names(b)), sparse = TRUE) |
151 | 28x |
if (!all(slots %in% slotNames(b))) b <- as(b, "CsparseMatrix") |
152 | 154x |
d <- c(dim(a), dim(b)) |
153 | 154x |
if (d[2] != d[4]) { |
154 | 3x |
ns <- colnames(a) |
155 | 3x |
if (!is.null(ns)) { |
156 | 3x |
ns <- ns[ns %in% colnames(b)] |
157 | 3x |
if (length(ns)) { |
158 | 3x |
a <- a[, ns, drop = FALSE] |
159 | 3x |
b <- b[, ns, drop = FALSE] |
160 |
} |
|
161 |
} |
|
162 | 3x |
d <- c(dim(a), dim(b)) |
163 | 3x |
if (d[2] != d[4]) { |
164 | ! |
stop("a and b have a different number of columns, which could not be aligned by name") |
165 |
} |
|
166 |
} |
|
167 | 154x |
if (lag) { |
168 | 4x |
b <- if (lag > 0) { |
169 | 2x |
rbind(Matrix(0, lag, d[4], sparse = TRUE), b[-(seq_len(lag) + d[3] - lag), ]) |
170 |
} else { |
|
171 | 2x |
rbind(b[-seq_len(-lag), ], Matrix(0, -lag, d[4], sparse = TRUE)) |
172 |
} |
|
173 |
} |
|
174 | 154x |
type <- if (((missing(pairwise) || !pairwise) && d[1] == d[3]) || |
175 | 154x |
d[3] == 1) { |
176 | 105x |
1 |
177 |
} else { |
|
178 | 49x |
3 |
179 |
} |
|
180 | 154x |
calculate_similarities(a, b, type, met$dummy) |
181 |
} |
|
182 |
} |
|
183 | 515x |
if ("list" %in% class(res) && length(res)) { |
184 | 231x |
pairwise <- "dtCMatrix" %in% class(res[[1]]) |
185 | 231x |
if ((pairwise && symmetrical) || mean) { |
186 | 33x |
for (i in seq_along(res)) { |
187 | 32x |
if (pairwise && (symmetrical || mean)) res[[i]] <- forceSymmetric(res[[i]], "L") |
188 | 33x |
if (mean) { |
189 | 3x |
res[[i]] <- if (is.null(dim(res[[i]]))) { |
190 | ! |
mean(res[[i]], na.rm = TRUE) |
191 |
} else { |
|
192 | 3x |
(rowSums(res[[i]], TRUE) - 1) / (ncol(res[[i]]) - 1) |
193 |
} |
|
194 |
} |
|
195 |
} |
|
196 |
} |
|
197 | 231x |
if (is.null(dim(res[[1]]))) { |
198 | 111x |
rn <- if (!is.na(nd <- which(c(dim(a), dim(b)) == length(res[[1]]))[1]) && !is.null(rownames(if (nd == 1) a else b))) { |
199 | 37x |
rownames(if (nd == 1) a else b) |
200 |
} else { |
|
201 | 74x |
NULL |
202 |
} |
|
203 | 111x |
if (length(met$selected) == 1) { |
204 | 37x |
if (length(rn) == length(res[[1]])) names(res[[1]]) <- rn |
205 |
} else { |
|
206 | 17x |
attr(res, "row.names") <- if (length(rn) == length(res[[1]])) rn else seq_along(res[[1]]) |
207 | 17x |
attr(res, "class") <- "data.frame" |
208 |
} |
|
209 |
} |
|
210 | 210x |
if (!return.list && length(met$selected) == 1) res <- res[[1]] |
211 |
} |
|
212 | 515x |
attr(res, "time") <- c(simets = proc.time()[[3]] - st) |
213 | 515x |
res |
214 |
} |
1 |
#' Select Latent Semantic Spaces |
|
2 |
#' |
|
3 |
#' Retrieve information and links to latent semantic spaces |
|
4 |
#' (sets of word vectors/embeddings) available at \href{https://osf.io/489he}{osf.io/489he}, |
|
5 |
#' and optionally download their term mappings (\href{https://osf.io/xr7jv}{osf.io/xr7jv}). |
|
6 |
#' |
|
7 |
#' @param query A character used to select spaces, based on names or other features. |
|
8 |
#' If length is over 1, \code{get.map} is set to \code{TRUE}. Use \code{terms} alone to select |
|
9 |
#' spaces based on term coverage. |
|
10 |
#' @param dir Path to a directory containing \code{lma_term_map.rda} and downloaded spaces; \cr will look in |
|
11 |
#' \code{getOption('lingmatch.lspace.dir')} and \code{'~/Latent Semantic Spaces'} by default. |
|
12 |
#' @param terms A character vector of terms to search for in the downloaded term map, to calculate |
|
13 |
#' coverage of spaces, or select by coverage if \code{query} is not specified. |
|
14 |
#' @param get.map Logical; if \code{TRUE} and \code{lma_term_map.rda} is not found in |
|
15 |
#' \code{dir}, the term map (\href{https://osf.io/xr7jv}{lma_term_map.rda}) is |
|
16 |
#' downloaded and decompressed. |
|
17 |
#' @param check.md5 Logical; if \code{TRUE} (default), retrieves the MD5 checksum from OSF, |
|
18 |
#' and compares it with that calculated from the downloaded file to check its integrity. |
|
19 |
#' @param mode Passed to \code{\link{download.file}} when downloading the term map. |
|
20 |
#' @return A list with varying entries: |
|
21 |
#' \itemize{ |
|
22 |
#' \item \strong{\code{info}}: The version of \href{https://osf.io/9yzca}{osf.io/9yzca} stored internally; a |
|
23 |
#' \code{data.frame} with spaces as row names, and information about each space in columns: |
|
24 |
#' \itemize{ |
|
25 |
#' \item \strong{\code{terms}}: number of terms in the space |
|
26 |
#' \item \strong{\code{corpus}}: corpus(es) on which the space was trained |
|
27 |
#' \item \strong{\code{model}}: model from which the space was trained |
|
28 |
#' \item \strong{\code{dimensions}}: number of dimensions in the model (columns of the space) |
|
29 |
#' \item \strong{\code{model_info}}: some parameter details about the model |
|
30 |
#' \item \strong{\code{original_max}}: maximum value used to normalize the space; the original |
|
31 |
#' space would be \code{(vectors *} \code{original_max) /} \code{100} |
|
32 |
#' \item \strong{\code{osf_dat}}: OSF id for the \code{.dat} files; the URL would be |
|
33 |
#' https://osf.io/\code{osf_dat} |
|
34 |
#' \item \strong{\code{osf_terms}}: OSF id for the \code{_terms.txt} files; the URL would be |
|
35 |
#' https://osf.io/\code{osf_terms} |
|
36 |
#' \item \strong{\code{wiki}}: link to the wiki for the space |
|
37 |
#' \item \strong{\code{downloaded}}: path to the \code{.dat} file if downloaded, |
|
38 |
#' and \code{''} otherwise. |
|
39 |
#' } |
|
40 |
#' \item \strong{\code{selected}}: A subset of \code{info} selected by \code{query}. |
|
41 |
#' \item \strong{\code{term_map}}: If \code{get.map} is \code{TRUE} or \code{lma_term_map.rda} is found in |
|
42 |
#' \code{dir}, a copy of \href{https://osf.io/xr7jv}{osf.io/xr7jv}, which has space names as |
|
43 |
#' column names, terms as row names, and indices as values, with 0 indicating the term is not |
|
44 |
#' present in the associated space. |
|
45 |
#' } |
|
46 |
#' @family Latent Semantic Space functions |
|
47 |
#' @examples |
|
48 |
#' # just retrieve information about available spaces |
|
49 |
#' spaces <- select.lspace() |
|
50 |
#' spaces$info[1:10, c("terms", "dimensions", "original_max")] |
|
51 |
#' |
|
52 |
#' # retrieve all spaces that used word2vec |
|
53 |
#' w2v_spaces <- select.lspace("word2vec")$selected |
|
54 |
#' w2v_spaces[, c("terms", "dimensions", "original_max")] |
|
55 |
#' |
|
56 |
#' \dontrun{ |
|
57 |
#' |
|
58 |
#' # select spaces by terms |
|
59 |
#' select.lspace(terms = c( |
|
60 |
#' "part-time", "i/o", "'cause", "brexit", "debuffs" |
|
61 |
#' ))$selected[, c("terms", "coverage")] |
|
62 |
#' } |
|
63 |
#' @export |
|
64 | ||
65 |
select.lspace <- function(query = NULL, dir = getOption("lingmatch.lspace.dir"), terms = NULL, |
|
66 |
get.map = FALSE, check.md5 = TRUE, mode = "wb") { |
|
67 | 16x |
if (ckd <- dir == "") dir <- "~/Latent Semantic Spaces" |
68 | 17x |
if (!missing(query) && !is.character(query) && !is.null(colnames(query))) { |
69 | ! |
terms <- colnames(query) |
70 | ! |
query <- NULL |
71 |
} |
|
72 | 17x |
map_path <- normalizePath(paste0(dir, "/lma_term_map.rda"), "/", FALSE) |
73 | 5x |
if (missing(get.map) && (file.exists(map_path) || length(terms) > 1)) get.map <- TRUE |
74 | 17x |
if (!exists("lma_term_map")) lma_term_map <- NULL |
75 | ! |
if (get.map && ckd && !dir.exists(dir)) stop("specify `dir` or use `lma_initdirs()` to download the term map") |
76 | 17x |
if (get.map && !(file.exists(map_path) || !is.null(lma_term_map))) { |
77 | 1x |
fi <- tryCatch( |
78 | 1x |
strsplit(readLines("https://api.osf.io/v2/files/xr7jv", 1, TRUE, FALSE, "utf-8"), '[:,{}"]+')[[1]], |
79 | 1x |
error = function(e) NULL |
80 |
) |
|
81 | 1x |
if (!file.exists(map_path) || (!is.null(fi) && md5sum(map_path) != fi[which(fi == "md5") + 1])) { |
82 | 1x |
dir.create(dir, FALSE, TRUE) |
83 | 1x |
status <- tryCatch(download.file( |
84 | 1x |
"https://osf.io/download/xr7jv", map_path, |
85 | 1x |
mode = mode |
86 | 1x |
), error = function(e) 1) |
87 | 1x |
if (!status && check.md5 && !is.null(fi)) { |
88 | 1x |
ck <- md5sum(map_path) |
89 | 1x |
if (fi[which(fi == "md5") + 1] == ck) { |
90 | 1x |
load(map_path) |
91 | 1x |
save(lma_term_map, file = map_path, compress = FALSE) |
92 |
} else { |
|
93 | ! |
warning(paste0( |
94 | ! |
"The term map's MD5 (", ck, ") does not seem to match the one on record;\n", |
95 | ! |
"double check and try manually downloading at https://osf.io/xr7jv/?show=revision" |
96 |
)) |
|
97 |
} |
|
98 |
} |
|
99 |
} |
|
100 | 16x |
} else if (!file.exists(map_path) && !is.null(terms)) { |
101 | ! |
stop("The term map could not be found; specify dir or run lma_initdirs('~') to download it", call. = FALSE) |
102 |
} |
|
103 | 17x |
r <- list(info = lss_info, selected = lss_info[NULL, ]) |
104 | 17x |
r$info[, "wiki"] <- paste0("https://osf.io/489he/wiki/", rownames(lss_info)) |
105 | 17x |
r$info[, "downloaded"] <- normalizePath(paste0(dir, "/", rownames(r$info), ".dat"), "/", FALSE) |
106 | 17x |
r$info[!file.exists(r$info[, "downloaded"]), "downloaded"] <- "" |
107 | 17x |
if (get.map) { |
108 | 10x |
if (!is.null(lma_term_map)) { |
109 | 1x |
r$term_map <- lma_term_map |
110 | 9x |
} else if (file.exists(map_path) && is.null(lma_term_map)) { |
111 | 9x |
load(map_path) |
112 | 9x |
r$term_map <- lma_term_map |
113 | 9x |
rm(list = "lma_term_map") |
114 |
} |
|
115 |
} |
|
116 | 17x |
if (!is.null(terms)) { |
117 | 3x |
if (length(terms) > 1 && "term_map" %in% names(r)) { |
118 | 3x |
terms <- tolower(terms) |
119 | 3x |
overlap <- terms[terms %in% rownames(r$term_map)] |
120 | 3x |
if (length(overlap)) { |
121 | 3x |
r$info$coverage <- colSums(r$term_map[overlap, , drop = FALSE] != 0) / length(terms) |
122 | 3x |
r$selected <- r$info[order(r$info$coverage, decreasing = TRUE)[1:5], ] |
123 | 3x |
r$space_terms <- overlap |
124 |
} else { |
|
125 | ! |
warning("no terms were found") |
126 |
} |
|
127 |
} |
|
128 |
} |
|
129 | 17x |
if (!is.null(query)) { |
130 | 8x |
query <- paste0(query, collapse = "|") |
131 | 8x |
if (!length(sel <- grep(query, rownames(lss_info), TRUE))) { |
132 | 3x |
collapsed <- vapply( |
133 | 3x |
seq_len(nrow(lss_info)), |
134 | 3x |
function(r) paste(c(rownames(lss_info)[r], lss_info[r, ]), collapse = " "), "" |
135 |
) |
|
136 | 3x |
if (!length(sel <- grep(query, collapsed, TRUE))) { |
137 | 2x |
sel <- grep(paste(strsplit(query, "[[:space:],|]+")[[1]], collapse = "|"), collapsed, TRUE) |
138 |
} |
|
139 |
} |
|
140 | 8x |
if (length(sel)) r$selected <- r$info[sel, ] |
141 |
} |
|
142 | 17x |
r |
143 |
} |
1 |
#' Select Dictionaries |
|
2 |
#' |
|
3 |
#' Retrieve information and links to dictionaries |
|
4 |
#' (lexicons/word lists) available at \href{https://osf.io/y6g5b}{osf.io/y6g5b}. |
|
5 |
#' |
|
6 |
#' @param query A character matching a dictionary name, or a set of keywords to search for in |
|
7 |
#' dictionary information. |
|
8 |
#' @param dir Path to a folder containing dictionaries, or where you want them to be saved. |
|
9 |
#' Will look in getOption('lingmatch.dict.dir') and '~/Dictionaries' by default. |
|
10 |
#' @param check.md5 Logical; if \code{TRUE} (default), retrieves the MD5 checksum from OSF, |
|
11 |
#' and compares it with that calculated from the downloaded file to check its integrity. |
|
12 |
#' @param mode Passed to \code{\link{download.file}} when downloading files. |
|
13 |
#' @return A list with varying entries: |
|
14 |
#' \itemize{ |
|
15 |
#' \item \strong{\code{info}}: The version of \href{https://osf.io/kjqb8}{osf.io/kjqb8} stored internally; a |
|
16 |
#' \code{data.frame} with dictionary names as row names, and information about each dictionary in columns. \cr |
|
17 |
#' Also described at |
|
18 |
#' \href{https://osf.io/y6g5b/wiki/dict_variables}{osf.io/y6g5b/wiki/dict_variables}, |
|
19 |
#' here \code{short} (corresponding to the file name [\code{{short}.(csv|dic)}] and |
|
20 |
#' wiki urls [\code{https://osf.io/y6g5b/wiki/{short}}]) is set as row names and removed: |
|
21 |
#' \itemize{ |
|
22 |
#' \item \strong{\code{name}}: Full name of the dictionary. |
|
23 |
#' \item \strong{\code{description}}: Description of the dictionary, relating to its purpose and |
|
24 |
#' development. |
|
25 |
#' \item \strong{\code{note}}: Notes about processing decisions that additionally alter the original. |
|
26 |
#' \item \strong{\code{constructor}}: How the dictionary was constructed: |
|
27 |
#' \itemize{ |
|
28 |
#' \item \strong{\code{algorithm}}: Terms were selected by some automated process, potentially |
|
29 |
#' learned from data or other resources. |
|
30 |
#' \item \strong{\code{crowd}}: Several individuals rated the terms, and in aggregate those ratings |
|
31 |
#' translate to categories and weights. |
|
32 |
#' \item \strong{\code{mixed}}: Some combination of the other methods, usually in some iterative |
|
33 |
#' process. |
|
34 |
#' \item \strong{\code{team}}: One of more individuals make decisions about term inclusions, |
|
35 |
#' categories, and weights. |
|
36 |
#' } |
|
37 |
#' \item \strong{\code{subject}}: Broad, rough subject or purpose of the dictionary: |
|
38 |
#' \itemize{ |
|
39 |
#' \item \strong{\code{emotion}}: Terms relate to emotions, potentially exemplifying or expressing |
|
40 |
#' them. |
|
41 |
#' \item \strong{\code{general}}: A large range of categories, aiming to capture the content of the |
|
42 |
#' text. |
|
43 |
#' \item \strong{\code{impression}}: Terms are categorized and weighted based on the impression they |
|
44 |
#' might give. |
|
45 |
#' \item \strong{\code{language}}: Terms are categorized or weighted based on their linguistic |
|
46 |
#' features, such as part of speech, specificity, or area of use. |
|
47 |
#' \item \strong{\code{social}}: Terms relate to social phenomena, such as characteristics or concerns |
|
48 |
#' of social entities. |
|
49 |
#' } |
|
50 |
#' \item \strong{\code{terms}}: Number of unique terms across categories. |
|
51 |
#' \item \strong{\code{term_type}}: Format of the terms: |
|
52 |
#' \itemize{ |
|
53 |
#' \item \strong{\code{glob}}: Include asterisks which denote inclusion of any characters until a |
|
54 |
#' word boundary. |
|
55 |
#' \item \strong{\code{glob+}}: Glob-style asterisks with regular expressions within terms. |
|
56 |
#' \item \strong{\code{ngram}}: Includes any number of words as a term, separated by spaces. |
|
57 |
#' \item \strong{\code{pattern}}: A string of characters, potentially within or between words, or |
|
58 |
#' spanning words. |
|
59 |
#' \item \strong{\code{regex}}: Regular expressions. |
|
60 |
#' \item \strong{\code{stem}}: Unigrams with common endings removed. |
|
61 |
#' \item \strong{\code{unigram}}: Complete single words. |
|
62 |
#' } |
|
63 |
#' \item \strong{\code{weighted}}: Indicates whether weights are associated with terms. This |
|
64 |
#' determines the file type of the dictionary: dictionaries with weights are stored |
|
65 |
#' as .csv, and those without are stored as .dic files. |
|
66 |
#' \item \strong{\code{regex_characters}}: Logical indicating whether special regular expression |
|
67 |
#' characters are present in any term, which might need to be escaped if the terms are used |
|
68 |
#' in regular expressions. Glob-type terms allow complete parens (at least one open and one |
|
69 |
#' closed, indicating preceding or following words), and initial and terminal asterisks. For |
|
70 |
#' all other terms, \code{[](){}*.^$+?\|} are counted as regex characters. These could be |
|
71 |
#' escaped in R with \code{gsub('([][)(}{*.^$+?\\\\|])', '\\\\\\1', terms)} if \code{terms} |
|
72 |
#' is a character vector, and in Python with (importing re) |
|
73 |
#' \code{[re.sub(r'([][(){}*.^$+?\|])', r'\\\1', term)} \code{for term in terms]} if \code{terms} |
|
74 |
#' is a list. |
|
75 |
#' \item \strong{\code{categories}}: Category names in the order in which they appear in the dictionary |
|
76 |
#' file, separated by commas. |
|
77 |
#' \item \strong{\code{ncategories}}: Number of categories. |
|
78 |
#' \item \strong{\code{original_max}}: Maximum value of the original dictionary before standardization: |
|
79 |
#' \code{original values / max(original values) * 100}. Dictionaries with no weights are |
|
80 |
#' considered to have a max of \code{1}. |
|
81 |
#' \item \strong{\code{osf}}: ID of the file on OSF, translating to the file's URL: |
|
82 |
#' https://osf.io/\code{osf}. |
|
83 |
#' \item \strong{\code{wiki}}: URL of the dictionary's wiki. |
|
84 |
#' \item \strong{\code{downloaded}}: Path to the file if downloaded, and \code{''} otherwise. |
|
85 |
#' } |
|
86 |
#' \item \strong{\code{selected}}: A subset of \code{info} selected by \code{query}. |
|
87 |
#' } |
|
88 |
#' @family Dictionary functions |
|
89 |
#' @examples |
|
90 |
#' # just retrieve information about available dictionaries |
|
91 |
#' dicts <- select.dict()$info |
|
92 |
#' dicts[1:10, 4:9] |
|
93 |
#' |
|
94 |
#' # select all dictionaries mentioning sentiment or emotion |
|
95 |
#' sentiment_dicts <- select.dict("sentiment emotion")$selected |
|
96 |
#' sentiment_dicts[1:10, 4:9] |
|
97 |
#' @export |
|
98 | ||
99 |
select.dict <- function(query = NULL, dir = getOption("lingmatch.dict.dir"), |
|
100 |
check.md5 = TRUE, mode = "wb") { |
|
101 | 6x |
if (dir == "") dir <- "~/Dictionaries" |
102 | 7x |
r <- list(info = dict_info, selected = dict_info[NULL, ]) |
103 | 7x |
r$info[, "wiki"] <- paste0("https://osf.io/y6g5b/wiki/", rownames(dict_info)) |
104 | 7x |
r$info[, "downloaded"] <- normalizePath(paste0( |
105 | 7x |
dir, "/", rownames(r$info), ifelse(r$info$weighted, ".csv", ".dic") |
106 | 7x |
), "/", FALSE) |
107 | 7x |
r$info[!file.exists(r$info[, "downloaded"]), "downloaded"] <- "" |
108 | 7x |
if (!missing(query)) { |
109 | 6x |
query <- paste0(query, collapse = "|") |
110 | 6x |
if (!length(sel <- grep(query, rownames(dict_info), TRUE))) { |
111 | 2x |
collapsed <- vapply( |
112 | 2x |
seq_len(nrow(dict_info)), |
113 | 2x |
function(r) paste(c(rownames(dict_info)[r], dict_info[r, ]), collapse = " "), "" |
114 |
) |
|
115 | 2x |
if (!length(sel <- grep(query, collapsed, TRUE))) { |
116 | 1x |
sel <- grep(paste(strsplit(query, "[[:space:],|]+")[[1]], collapse = "|"), collapsed, TRUE) |
117 |
} |
|
118 |
} |
|
119 | 5x |
if (length(sel)) r$selected <- r$info[sel, , drop = FALSE] |
120 |
} |
|
121 | 7x |
r |
122 |
} |
1 |
#' Download Latent Semantic Spaces |
|
2 |
#' |
|
3 |
#' Downloads the specified semantic space from \href{https://osf.io/489he}{osf.io/489he}. |
|
4 |
#' |
|
5 |
#' @param space Name of one or more spaces you want to download, or \code{'all'} for all available. |
|
6 |
#' \code{'100k_lsa'} is the default, and some other common options might be \code{'google'}, \code{'facebook'}, |
|
7 |
#' or \code{'glove'}. See \href{https://osf.io/489he/wiki/home}{osf.io/489he/wiki} for more information, |
|
8 |
#' and a full list of spaces. |
|
9 |
#' @param decompress Logical; if \code{TRUE} (default), decompresses the downloaded file |
|
10 |
#' with the \code{bunzip2} system command assuming it is available \cr (as indicated by |
|
11 |
#' \code{Sys.which('bunzip2')}). |
|
12 |
#' @param check.md5 Logical; if \code{TRUE} (default), retrieves the MD5 checksum from OSF, |
|
13 |
#' and compares it with that calculated from the downloaded file to check its integrity. |
|
14 |
#' @param mode A character specifying the file write mode; default is 'wb'. See |
|
15 |
#' \code{\link{download.file}}. |
|
16 |
#' @param dir Directory in which to save the space. Specify this here, or set the lspace directory option |
|
17 |
#' (e.g., \code{options(lingmatch.lspace.dir = '~/Latent Semantic Spaces')}), or use |
|
18 |
#' \code{\link{lma_initdirs}} to initialize a directory. |
|
19 |
#' @param overwrite Logical; if \code{TRUE}, will replace existing files. |
|
20 |
#' @family Latent Semantic Space functions |
|
21 |
#' @return A character vector with paths to the [1] data and [2] term files. |
|
22 |
#' @examples |
|
23 |
#' \dontrun{ |
|
24 |
#' |
|
25 |
#' download.lspace("glove_crawl", dir = "~/Latent Semantic Spaces") |
|
26 |
#' } |
|
27 |
#' @export |
|
28 |
#' @importFrom utils download.file |
|
29 |
#' @importFrom tools md5sum |
|
30 | ||
31 |
download.lspace <- function(space = "100k_lsa", decompress = TRUE, |
|
32 |
check.md5 = TRUE, mode = "wb", dir = getOption("lingmatch.lspace.dir"), |
|
33 |
overwrite = FALSE) { |
|
34 | 1x |
download.resource( |
35 | 1x |
"lspace", space, |
36 | 1x |
decompress = decompress, check.md5 = check.md5, mode = mode, dir = dir, overwrite = overwrite |
37 |
) |
|
38 |
} |
1 |
#include <Rcpp.h> |
|
2 |
using namespace std; |
|
3 |
using namespace Rcpp; |
|
4 | ||
5 |
/** |
|
6 |
* @brief Converts tokenized texts to indices, as part of document-term matrix |
|
7 |
* creation. |
|
8 |
* |
|
9 |
* @param tokens A list of character vectors; tokenized texts. |
|
10 |
* @param terms A character vector containing all terms. |
|
11 |
* @param isword A logical vector indicating whether each term is a word or. |
|
12 |
* punctuation mark (and so should count toward word-count). |
|
13 |
* @param dim Dimensions of the output matrix [n rows, n cols]. |
|
14 |
* @param complete Logical indicating whether to all terms tokens should be |
|
15 |
* counted, or only those in `terms` should count. |
|
16 |
* @param tokensonly Logical specifying the return type. |
|
17 |
* @return List; a term map with row indices (if `tokensonly`), or a sparse |
|
18 |
* matrix. |
|
19 |
*/ |
|
20 | ||
21 |
// [[Rcpp::export]] |
|
22 | 77x |
List match_terms(const List &tokens, const CharacterVector &terms, |
23 |
const LogicalVector &isword, const IntegerVector &dim, |
|
24 |
const bool &complete, const bool &tokensonly) { |
|
25 | 77x |
const int n = tokens.length(); |
26 | 77x |
int i = terms.length(), colindex, s = 0, un, p, ck = 1e3; |
27 | 308x |
vector<int> rows, columns, rowsums(n), colsums(i); |
28 | 77x |
vector<double> counts; |
29 | 77x |
CharacterVector uses; |
30 | 77x |
unordered_map<String, int> dict; |
31 | 15081x |
for (; i--;) dict.insert({terms[i], i}); |
32 | 77x |
if (tokensonly) { |
33 | 32x |
for (; s < n; s++) { |
34 | 30x |
uses = tokens[s]; |
35 | 1230x |
for (un = uses.length(), p = 0; p < un; p++) |
36 | 1200x |
if (complete || dict.find(uses[p]) != dict.end()) { |
37 | 1178x |
colindex = dict.at(uses[p]); |
38 | 1178x |
rows.push_back(colindex); |
39 | 1178x |
colsums[colindex]++; |
40 | 1178x |
rowsums[s]++; |
41 |
} |
|
42 | 30x |
if (!--ck) { |
43 | ! |
checkUserInterrupt(); |
44 | ! |
ck = 1e3; |
45 |
} |
|
46 |
} |
|
47 | 2x |
return List::create(dict, colsums, rowsums, rows); |
48 |
} else { |
|
49 | 1301x |
for (; s < n; s++) { |
50 | 1226x |
uses = tokens[s]; |
51 | 63612x |
for (un = uses.length(), p = 0; p < un; p++) |
52 | 62386x |
if (complete || dict.find(uses[p]) != dict.end()) { |
53 | 62355x |
colindex = dict.at(uses[p]); |
54 | 62355x |
colsums[colindex]++; |
55 | 62355x |
if (isword[colindex]) rowsums[s]++; |
56 | 62355x |
if (p == 0 || uses[p] != uses[p - 1]) { |
57 | 62021x |
rows.push_back(s); |
58 | 62021x |
columns.push_back(colindex); |
59 | 62021x |
counts.push_back(1); |
60 | 62021x |
i++; |
61 |
} else |
|
62 | 334x |
counts[i]++; |
63 |
} |
|
64 | 1226x |
if (!--ck) { |
65 | ! |
checkUserInterrupt(); |
66 | ! |
ck = 1e3; |
67 |
} |
|
68 |
} |
|
69 | 150x |
S4 dtm("dgTMatrix"); |
70 | 75x |
dtm.slot("Dim") = dim; |
71 | 225x |
dtm.slot("Dimnames") = List::create(R_NilValue, terms); |
72 | 150x |
dtm.slot("i") = rows; |
73 | 150x |
dtm.slot("j") = columns; |
74 | 75x |
dtm.slot("x") = counts; |
75 | 75x |
return List::create(dtm, rowsums, colsums); |
76 |
} |
|
77 |
} |
1 |
// [[Rcpp::depends(BH)]] |
|
2 |
#include <Rcpp.h> |
|
3 | ||
4 |
#include <fstream> |
|
5 |
#define BOOST_REGEX_USE_C_LOCALE |
|
6 |
#include <boost/regex.hpp> |
|
7 |
using namespace std; |
|
8 |
using namespace Rcpp; |
|
9 | ||
10 |
/** |
|
11 |
* @brief Standardizes embeddings. |
|
12 |
* |
|
13 |
* @param infile Path to original embeddings file. |
|
14 |
* @param outfile Base path to reformatted files, which will translate to a |
|
15 |
* `.dat` and `_terms.txt` file. |
|
16 |
* @param sep Separating character(s). |
|
17 |
* @param digits Number of digits to round to. |
|
18 |
* @param remove String to remove to any terms. |
|
19 |
* @param term_check Regex pattern used to check terms for inclusion. |
|
20 |
* @param verbose Logical indicating whither to print every thousandth line |
|
21 |
* during reformatting. |
|
22 |
* @return void; writes output files. |
|
23 |
*/ |
|
24 | ||
25 |
// [[Rcpp::export]] |
|
26 | 2x |
void reformat_embedding(const std::string &infile, const std::string &outfile, |
27 |
const char &sep = ' ', const int &digits = 9, |
|
28 |
const std::string &remove = "", |
|
29 |
const std::string &term_check = |
|
30 |
"^[a-zA-Z]+$|^['a-zA-Z][a-zA-Z.'\\/-]*[a-zA-Z.]$", |
|
31 |
const bool &verbose = false) { |
|
32 | 2x |
ifstream d(infile); |
33 | 2x |
ofstream o(outfile + ".dat"), cn(outfile + "_terms.txt"); |
34 | 2x |
o << setprecision(digits) << fixed; |
35 | 2x |
int n, i, ln, cl = 0, ck = 1e3; |
36 | 2x |
const string num = "-.0123456789"; |
37 | 2x |
bool start = true, filter = term_check != ""; |
38 | 2x |
boost::regex ckterm(term_check, boost::regex_constants::optimize), |
39 | 2x |
rm(remove, boost::regex_constants::optimize); |
40 | 2x |
std::string line, term, value; |
41 | 202x |
for (; getline(d, line);) { |
42 | 1420x |
for (cl++, term = "", n = line.length(), i = 0; i < n; i++) { |
43 | 1420x |
if (line[i] == sep) break; |
44 | 1220x |
term.push_back(line[i]); |
45 |
} |
|
46 |
if (remove != "") term = boost::regex_replace(term, rm, ""); |
|
47 | 400x |
if (term != "" && n > 100 && i++ < n && |
48 | 590x |
(!filter || boost::regex_match(term, ckterm)) && |
49 | 190x |
num.find(line[i]) != string::npos) |
50 |
try { |
|
51 | 190x |
ln = line.length(); |
52 | 190x |
if (ln) { |
53 | 190x |
cn << term << endl; |
54 | 1150524x |
for (start = true, value = ""; i < ln; i++) { |
55 | 1150334x |
if (line[i] == sep) { |
56 | 56810x |
if (value != "") { |
57 | 56810x |
if (start) { |
58 | 190x |
o << atof(value.c_str()); |
59 | 190x |
start = false; |
60 |
} else |
|
61 | 56620x |
o << ' ' << atof(value.c_str()); |
62 | 56810x |
value = ""; |
63 |
} |
|
64 |
} else |
|
65 | 1093524x |
value.push_back(line[i]); |
66 |
} |
|
67 |
value == "" ? o << endl : o << ' ' << atof(value.c_str()) << endl; |
|
68 |
} |
|
69 |
} catch (const std::exception &e) { |
|
70 |
Rcout << line << endl; |
|
71 |
} |
|
72 | 200x |
if (!--ck) { |
73 | ! |
checkUserInterrupt(); |
74 | ! |
if (verbose) Rcout << "line " << cl << ": " << term << endl; |
75 | ! |
ck = 1e3; |
76 |
} |
|
77 |
} |
|
78 |
} |
|
79 | ||
80 |
/** |
|
81 |
* @brief Read embeddings vectors based on term indices. |
|
82 |
* |
|
83 |
* @param indices Integer vector containing the indices of the vectors to load. |
|
84 |
* @param file File containing vectors. |
|
85 |
* @param sep Separator between values. |
|
86 |
* @return NumericVector; term by dimension matrix. |
|
87 |
*/ |
|
88 | ||
89 |
// [[Rcpp::export]] |
|
90 | 18x |
NumericVector extract_indices(const IntegerVector &indices, |
91 |
const std::string &file, const char &sep = ' ') { |
|
92 | 18x |
int nr = indices.length(), ck = 1e3, nc = 0, c = 0, t = 0, p = 0, ln, i; |
93 | 18x |
std::string line, value; |
94 | 18x |
ifstream d(file); |
95 | 49307x |
for (getline(d, line), t = p = line.length(), value = ""; p--;) { |
96 | 49289x |
if (line[p] == sep) { |
97 | 5182x |
if (p != t - 1) nc++; |
98 | 44107x |
} else if (!nc) |
99 | 155x |
value.insert(value.begin(), line[p]); |
100 |
} |
|
101 |
nc == 1 ? nc = atoi(value.c_str()) : nc++; |
|
102 | 18x |
IntegerVector dims = {nc, nr}; |
103 | 18x |
NumericVector r(nc * nr); |
104 | 18x |
if (indices[0] == 1) { |
105 | 8x |
d.seekg(0, d.beg); |
106 | 8x |
p = 0; |
107 |
} else |
|
108 | 10x |
p = 1; |
109 | 1238496x |
for (t = 0; t < nr;) { |
110 | 1238478x |
if (++p == indices[t]) { |
111 | 310764x |
getline(d, line); |
112 | 363189637x |
for (value = "", ln = line.length(), c = 0, i = 0; i < ln; i++) { |
113 | 362878873x |
if (line[i] == sep) { |
114 | 37437236x |
r[t * nc + c++] = atof(value.c_str()); |
115 | 37437236x |
value = ""; |
116 |
} else |
|
117 | 325441637x |
value.push_back(line[i]); |
118 |
} |
|
119 | 310764x |
r[t++ * nc + c] = atof(value.c_str()); |
120 |
} else |
|
121 | 927714x |
d.ignore(numeric_limits<streamsize>::max(), '\n'); |
122 | 1238478x |
if (!--ck) { |
123 | 1231x |
checkUserInterrupt(); |
124 | 1231x |
ck = 1e3; |
125 |
} |
|
126 |
} |
|
127 | 18x |
d.close(); |
128 | 18x |
r.attr("dim") = dims; |
129 | 36x |
return r; |
130 |
} |
1 |
// [[Rcpp::depends(RcppParallel)]] |
|
2 |
#include <Rcpp.h> |
|
3 |
#include <RcppParallel.h> |
|
4 |
using namespace std; |
|
5 |
using namespace Rcpp; |
|
6 |
using namespace RcppParallel; |
|
7 | ||
8 |
/** |
|
9 |
* @brief Calculate similarity between two vectors. |
|
10 |
* |
|
11 |
* @param a First numeric vector. |
|
12 |
* @param b Second numeric vector. |
|
13 |
* @param metrics Binary integer vector indicating which metrics to calculate |
|
14 |
* ([Jaccard, Euclidean, Canberra, Cosine, Pearson's]). |
|
15 |
* @return NumericVector; each requested similarity value. |
|
16 |
*/ |
|
17 | ||
18 |
// [[Rcpp::export]] |
|
19 | 428x |
NumericVector vector_similarity(NumericVector &a, NumericVector &b, |
20 |
const IntegerVector &metrics) { |
|
21 | 428x |
const int n = a.length(); |
22 | 428x |
bool ck = true; |
23 | 428x |
NumericVector op; |
24 | 428x |
LogicalVector su = is_na(a) | is_na(b) | ((a == 0) & (b == 0)); |
25 | 428x |
if (is_true(all(su))) |
26 | ! |
ck = false; |
27 | 428x |
else if (is_true(any(su))) { |
28 | 145x |
a = a[!su]; |
29 | 145x |
b = b[!su]; |
30 |
} |
|
31 | 428x |
if (metrics[0]) { |
32 |
op.push_back(ck ? (float)sum((a != 0) & (b != 0)) / sum((a != 0) | (b != 0)) |
|
33 |
: NA_REAL, |
|
34 |
"jaccard"); |
|
35 |
} |
|
36 | 428x |
if (metrics[1]) |
37 |
op.push_back(ck ? 1 / (1 + sqrt(sum(pow(a - b, 2)))) : 1, "euclidean"); |
|
38 | 428x |
if (metrics[2]) |
39 |
op.push_back(ck ? 1 - sum(abs(a - b) / (abs(a) + abs(b))) / n : NA_REAL, |
|
40 |
"canberra"); |
|
41 | 428x |
if (metrics[3]) { |
42 | 606x |
op.push_back( |
43 |
ck ? sum(a * b) / sqrt(sum(pow(a, 2) * sum(pow(b, 2)))) : NA_REAL, |
|
44 |
"cosine"); |
|
45 | 610x |
if (isnan((float)op["cosine"])) op["cosine"] = NA_REAL; |
46 |
} |
|
47 | 428x |
if (metrics[4]) { |
48 | 27x |
const double ma = sum(a) / n, mb = sum(b) / n; |
49 |
op.push_back(ck ? (sum(a * b) / n - (ma * mb)) / |
|
50 | 27x |
sqrt(sum(pow(a, 2)) / n - pow(ma, 2)) / |
51 | 54x |
sqrt(sum(pow(b, 2)) / n - pow(mb, 2)) |
52 |
: NA_REAL, |
|
53 |
"pearson"); |
|
54 | 58x |
if (isnan((float)op["pearson"])) op["pearson"] = NA_REAL; |
55 |
} |
|
56 | 856x |
return op; |
57 |
} |
|
58 | ||
59 |
class Sparse_Arrays { |
|
60 |
public: |
|
61 |
const NumericVector values; |
|
62 |
const IntegerVector dims, i, p; |
|
63 |
const int n = 0, ncol = 0; |
|
64 |
IntegerVector row_starts, row_maps, columns; |
|
65 | 385x |
Sparse_Arrays(const S4 &m) |
66 | 1155x |
: values(m.slot("x")), |
67 | 770x |
dims(m.slot("Dim")), |
68 | 770x |
i(m.slot("i")), |
69 | 385x |
p(m.slot("p")), |
70 | 385x |
n(values.length()), |
71 | 385x |
ncol(dims[1]), |
72 | 385x |
row_starts(IntegerVector(dims[0], -1)), |
73 | 385x |
row_maps(IntegerVector(n, -1)), |
74 | 385x |
columns(IntegerVector(n)) { |
75 | 385x |
int index = 0, r = 0, c = p[0]; |
76 | 385x |
unordered_map<int, int> row_steps; |
77 | 132229890x |
for (; index < n; index++) { |
78 | 132229505x |
r = i[index]; |
79 | 132229505x |
if (row_starts[r] == -1) { |
80 | 1040651x |
row_starts[r] = index; |
81 | 1040651x |
row_steps.insert({r, index}); |
82 |
} else { |
|
83 | 131188854x |
row_maps[row_steps.at(r)] = index; |
84 | 131188854x |
row_steps.at(r) = index; |
85 |
} |
|
86 | 132229505x |
if (c + 1 < ncol && index == p[c + 1]) { |
87 | 49383x |
c++; |
88 | 55472x |
while (c + 1 < ncol && p[c] == p[c + 1]) c++; |
89 |
} |
|
90 | 132229505x |
columns[index] = c; |
91 |
} |
|
92 |
} |
|
93 |
}; |
|
94 | ||
95 |
struct Compare : public Worker { |
|
96 |
const Sparse_Arrays a, b; |
|
97 |
const int n, ncol, type, nmetrics = 5; |
|
98 |
RVector<int> aind, bind, metrics; |
|
99 |
RVector<double> jaccard, euclidean, canberra, cosine, pearson; |
|
100 | 231x |
Compare(const Sparse_Arrays &a, const Sparse_Arrays &b, |
101 |
const IntegerVector &i, const IntegerVector &j, const int &type, |
|
102 |
const IntegerVector &metrics, |
|
103 |
unordered_map<String, NumericVector> &out) |
|
104 | 462x |
: a(a), |
105 | 231x |
b(b), |
106 | 231x |
n(a.n), |
107 | 231x |
ncol(a.ncol), |
108 | 231x |
type(type), |
109 | 231x |
aind(i), |
110 | 231x |
bind(j), |
111 | 231x |
metrics(metrics), |
112 | 231x |
jaccard(out.at("jaccard")), |
113 | 231x |
euclidean(out.at("euclidean")), |
114 | 231x |
canberra(out.at("canberra")), |
115 | 231x |
cosine(out.at("cosine")), |
116 | 693x |
pearson(out.at("pearson")) {} |
117 | 137626x |
void operator()(size_t p, size_t final) { |
118 |
int r, comp, i, c, bi, bc, l, col; |
|
119 |
double x, bx, dif, sa, sb, sdif, sadif, sse, sne, cp, asq, bsq, sj, si, ma, |
|
120 |
mb; |
|
121 | 2936314x |
for (; p < final; p++) { |
122 | 2799259x |
r = aind[p]; |
123 | 2795707x |
comp = bind[p]; |
124 | 2796901x |
i = a.row_starts[r]; |
125 | 2779528x |
c = i == -1 ? -1 : a.columns[i]; |
126 | 2774633x |
bi = b.row_starts[comp]; |
127 | 2771176x |
bc = bi == -1 ? -1 : b.columns[bi]; |
128 | 2767589x |
x = bx = sa = sb = sdif = sadif = sse = sne = cp = asq = bsq = sj = si = |
129 |
0; |
|
130 | 234284358x |
for (l = ncol; l--;) { |
131 | 263110273x |
col = c; |
132 | 263110273x |
if (i == -1 || (bc != -1 && c > bc)) { |
133 | 754610x |
x = 0; |
134 | 754610x |
if (i == -1) c = -1; |
135 |
} else { |
|
136 | 262355663x |
x = a.values[i]; |
137 | 163406580x |
i = a.row_maps[i]; |
138 | 135819982x |
c = i == -1 ? -1 : a.columns[i]; |
139 |
if (NumericVector::is_na(x)) continue; |
|
140 |
} |
|
141 | 118446667x |
if (bi == -1 || (col != -1 && bc > col)) { |
142 | 578863x |
bx = 0; |
143 | 578863x |
if (bi == -1) bc = -1; |
144 |
} else { |
|
145 | 117867804x |
bx = b.values[bi]; |
146 | 123909872x |
bi = b.row_maps[bi]; |
147 | 120818819x |
bc = bi == -1 ? -1 : b.columns[bi]; |
148 | 108430728x |
if (NumericVector::is_na(bx) || bx + x == 0) continue; |
149 |
} |
|
150 | 116228658x |
sa += x; |
151 | 116228658x |
sb += bx; |
152 | 116228658x |
dif = x - bx; |
153 | 116228658x |
sse += pow(dif, 2); |
154 | 212765449x |
dif = abs(dif); |
155 | 226389827x |
sdif += dif; |
156 | 226389827x |
if (x || bx) sne += dif / (abs(x) + abs(bx)); |
157 | 222475667x |
cp += x * bx; |
158 | 222475667x |
asq += pow(x, 2); |
159 | 223499606x |
bsq += pow(bx, 2); |
160 | 234315438x |
sj += x && bx; |
161 | 234315438x |
si += x || bx; |
162 | 234315438x |
if (i == -1 && bi == -1) break; |
163 |
} |
|
164 | ! |
ma = sa / ncol; |
165 | ! |
mb = sb / ncol; |
166 | ! |
if (si) { |
167 | 2802775x |
if (metrics[0]) jaccard[p] = sj / si; |
168 | 2796600x |
if (metrics[2]) canberra[p] = 1 - sne / ncol; |
169 |
} |
|
170 | ! |
if (metrics[1]) euclidean[p] = 1 / (1 + sqrt(sse)); |
171 | 2797431x |
if (sa && sb) { |
172 | 2797713x |
if (metrics[3] && asq && bsq) cosine[p] = cp / sqrt(asq) / sqrt(bsq); |
173 | 2800478x |
if (metrics[4]) { |
174 | 20398x |
x = (cp / ncol - ma * mb) / sqrt(asq / ncol - pow(ma, 2)) / |
175 | 19960x |
sqrt(bsq / ncol - pow(mb, 2)); |
176 |
pearson[p] = !isnan(x) ? x : NA_REAL; |
|
177 |
} |
|
178 |
} |
|
179 |
} |
|
180 |
} |
|
181 |
}; |
|
182 | ||
183 |
/** |
|
184 |
* @brief Calculate similarities between vectors within two matrices. |
|
185 |
* |
|
186 |
* @param m Primary sparse matrix. |
|
187 |
* @param comp The second sparse matrix, or NULL if pairwise comparisons within |
|
188 |
* `m` are to be made. |
|
189 |
* @param type Integer indicating which type of comparison to make: 1 = `m` row |
|
190 |
* to specific `comp` row, 2 = all `m` rows to `m` rows, 3 = all `m` rows to all |
|
191 |
* `comp` rows. |
|
192 |
* @param metrics Binary integer vector indicating which metrics to calculate. |
|
193 |
* @return List; a sparse matrix containing similarities for each requested |
|
194 |
* metric. |
|
195 |
*/ |
|
196 | ||
197 |
// [[Rcpp::export]] |
|
198 | 231x |
List calculate_similarities(const S4 &m, const RObject &comp, int &type, |
199 |
const IntegerVector &metrics) { |
|
200 | 231x |
const bool procb = type != 2 && comp.isS4(); |
201 | 231x |
Sparse_Arrays a(m), b = procb ? Sparse_Arrays(as<S4>(comp)) : a; |
202 | 231x |
const IntegerVector dim{a.dims[0], b.dims[0]}; |
203 | 231x |
size_t nrow = b.dims[0], ai = 1, bi = 0, index = metrics.length(); |
204 | 357x |
const size_t arow = a.dims[0], n = type == 1 ? a.dims[0] |
205 | 126x |
: type == 2 ? arow * (arow - 1) / 2 |
206 | 231x |
: arow * nrow; |
207 |
// setting up output vectors |
|
208 |
const CharacterVector metric_names{"jaccard", "euclidean", "canberra", |
|
209 | 231x |
"cosine", "pearson"}; |
210 | 231x |
unordered_map<String, NumericVector> res; |
211 | 1386x |
for (; index--;) |
212 | 1155x |
res.insert({metric_names[index], NumericVector(metrics[index] ? n : 0)}); |
213 |
// mapping out row-row comparisons |
|
214 | 231x |
IntegerVector aind(n), bind(n), column_starts(nrow + 1); |
215 | 231x |
column_starts[nrow] = n; |
216 | 231x |
switch (type) { |
217 | 105x |
case 1: // each a row to a single b row or paired b rows |
218 | 105x |
aind = seq_len(n) - 1; |
219 | 105x |
if (nrow == n) bind = aind; |
220 | 105x |
break; |
221 | 77x |
case 2: // pairwise between a rows |
222 | 60002x |
for (index = 0; index < n; index++) { |
223 | 59925x |
aind[index] = ai; |
224 | 59925x |
bind[index] = bi; |
225 | 59925x |
if (++ai == nrow) { |
226 | 2142x |
column_starts[++bi] = index + 1; |
227 | 2142x |
ai = bi + 1; |
228 |
} |
|
229 |
} |
|
230 | 77x |
break; |
231 | 49x |
case 3: // each a row with each b row |
232 | 49x |
ai = 0, nrow = a.dims[0], index = 0; |
233 | 2343945x |
for (; index < n; index++) { |
234 | 2343896x |
aind[index] = ai; |
235 | 2343896x |
bind[index] = bi; |
236 | 2343896x |
if (++ai == nrow) { |
237 | 447x |
column_starts[++bi] = index + 1; |
238 | 447x |
ai = 0; |
239 |
} |
|
240 |
} |
|
241 | 49x |
break; |
242 |
} |
|
243 |
// making comparisons |
|
244 | 462x |
Compare sims(a, b, aind, bind, type, metrics, res); |
245 | 231x |
checkUserInterrupt(); |
246 | 231x |
parallelFor(0, n, sims); |
247 |
// formatting output |
|
248 | 231x |
if (a.dims[0] == 1) type = 1; |
249 | 231x |
List op; |
250 | 462x |
NumericVector sim; |
251 | 1386x |
for (const String met : metric_names) { |
252 | 1155x |
sim = res.at(met); |
253 | 1155x |
if (sim.length()) { |
254 | 307x |
if (type == 1) { |
255 | 176x |
op.push_back(sim, met); |
256 |
} else { |
|
257 | 262x |
S4 simsm(type == 2 ? "dtCMatrix" : "dgCMatrix"); |
258 | 262x |
simsm.slot("Dim") = dim; |
259 | 131x |
List dimnames(m.slot("Dimnames")), |
260 |
newdimnames{dimnames[0], |
|
261 | 131x |
dim[1] == dim[0] ? dimnames[0] : R_NilValue}; |
262 | 131x |
if (procb) { |
263 | 46x |
List dimnames2(comp.slot("Dimnames")); |
264 | 46x |
newdimnames[1] = dimnames2[0]; |
265 |
} |
|
266 | 262x |
simsm.slot("Dimnames") = newdimnames; |
267 | 262x |
simsm.slot("i") = aind; |
268 | 262x |
simsm.slot("p") = column_starts; |
269 | 131x |
simsm.slot("x") = sim; |
270 | 131x |
if (type == 2) { |
271 | 170x |
simsm.slot("uplo") = "L"; |
272 | 170x |
simsm.slot("diag") = "U"; |
273 |
} |
|
274 | 131x |
op.push_back(simsm, met); |
275 |
} |
|
276 |
} |
|
277 |
} |
|
278 | 462x |
return op; |
279 |
} |
1 |
// [[Rcpp::depends(BH)]] |
|
2 |
#include <Rcpp.h> |
|
3 | ||
4 |
#include <fstream> |
|
5 |
#define BOOST_REGEX_USE_C_LOCALE |
|
6 |
#include <boost/regex.hpp> |
|
7 |
using namespace std; |
|
8 |
using namespace Rcpp; |
|
9 | ||
10 |
/** |
|
11 |
* @brief Count pattern matches within text. |
|
12 |
* |
|
13 |
* @param texts Text to search within. |
|
14 |
* @param patterns Patterns to search for. |
|
15 |
* @param terms Integer vector containing term indices. |
|
16 |
* @param fixed Logical indicating whether patterns are regular expressions or |
|
17 |
* fixed. |
|
18 |
* @param exclusive Logical indicating whether matches should be removed on |
|
19 |
* match (avoiding sub-matches). |
|
20 |
* @return List; a sparse document-term matrix. |
|
21 |
*/ |
|
22 | ||
23 |
// [[Rcpp::export]] |
|
24 | 70x |
List pattern_search(const CharacterVector &texts, |
25 |
const CharacterVector &patterns, const IntegerVector &terms, |
|
26 |
const bool &fixed, const bool &exclusive) { |
|
27 | 70x |
int cx, r, l, i = 0, n = patterns.length(), tn = texts.length(); |
28 |
unsigned int p, tp; |
|
29 | 70x |
std::string txt; |
30 | 70x |
IntegerVector dim{tn, (int)patterns.length()}, rows, columns, rowsums(dim[0]); |
31 | 70x |
NumericVector values; |
32 | 70x |
if (fixed) { |
33 | 259x |
for (r = 0; r < tn; r++) { |
34 | 205x |
checkUserInterrupt(); |
35 | 205x |
txt = texts[r]; |
36 | 16827x |
for (i = 0, n = patterns.length(); i < n; i++) { |
37 | 16622x |
l = patterns[i].size(), cx = 0; |
38 | 16622x |
if (l) |
39 | 16622x |
for (std::string::size_type r, p = 0; |
40 | 24241x |
(r = txt.find(patterns[i], p)) != std::string::npos;) { |
41 | 7619x |
if (exclusive) { |
42 | 7581x |
txt.replace(r, l, " "); |
43 | 7581x |
p = r + 1; |
44 |
} else |
|
45 | 38x |
p = r + l; |
46 | 7619x |
cx += 1; |
47 |
} |
|
48 | 16622x |
if (cx) { |
49 | 3860x |
rows.push_back(r); |
50 | 3860x |
columns.push_back(terms[i]); |
51 | 3860x |
values.push_back(cx); |
52 | 3860x |
rowsums[r] += cx; |
53 |
} |
|
54 |
} |
|
55 |
} |
|
56 |
} else { |
|
57 | 16x |
vector<boost::regex> pats; |
58 | 16x |
if (!fixed) |
59 | 3131x |
for (; i < n; i++) |
60 | 3115x |
pats.push_back( |
61 | 6230x |
boost::regex(patterns[i], boost::regex_constants::optimize)); |
62 | 16x |
const unsigned int zero = 0; |
63 | 51x |
for (r = zero; r < tn; r++) { |
64 | 35x |
checkUserInterrupt(); |
65 | 35x |
txt = texts[r]; |
66 | 35x |
boost::smatch re; |
67 | 5140x |
for (i = zero, n = pats.size(); i < n; i++) { |
68 | 5105x |
std::string::const_iterator start = txt.cbegin(); |
69 | 5105x |
cx = zero; |
70 | 5105x |
if (patterns[i].size()) |
71 | 5105x |
for (tp = zero; |
72 | 6251x |
boost::regex_search(start, txt.cend(), re, pats[i]);) { |
73 | 1146x |
p = re.position(zero); |
74 | 1146x |
if (exclusive) { |
75 | 1099x |
txt.replace(tp + p, re.length(), " "); |
76 | 1099x |
start += p + 1; |
77 | 1099x |
tp += p + 1; |
78 |
} else |
|
79 | 47x |
start += p + re.length(); |
80 | 1146x |
cx += 1; |
81 |
} |
|
82 | 5105x |
if (cx) { |
83 | 536x |
rows.push_back(r); |
84 | 536x |
columns.push_back(terms[i]); |
85 | 536x |
values.push_back(cx); |
86 | 536x |
rowsums[r] += cx; |
87 |
} |
|
88 |
} |
|
89 |
} |
|
90 |
} |
|
91 | 140x |
S4 dtm("dgTMatrix"); |
92 | 140x |
dtm.slot("Dim") = dim; |
93 | 140x |
dtm.slot("i") = rows; |
94 | 140x |
dtm.slot("j") = columns; |
95 | 70x |
dtm.slot("x") = values; |
96 | 140x |
return List::create(dtm, rowsums); |
97 |
} |
|
98 | ||
99 |
/** |
|
100 |
* @brief Extract matches to fuzzy terms from text. |
|
101 |
* |
|
102 |
* @param terms Character vector containing regular expression patterns. |
|
103 |
* @param text Text to extract term matches from. |
|
104 |
* @param raw Logical indicating whether `text` is raw text or collapsed terms. |
|
105 |
* @return List; a table of matches and their counts for each term. |
|
106 |
*/ |
|
107 | ||
108 |
// [[Rcpp::export]] |
|
109 | 13x |
List extract_matches(const CharacterVector &terms, |
110 |
const std::vector<std::string> &text, const bool &raw) { |
|
111 | 13x |
const int n = terms.length(); |
112 | 13x |
List res(n); |
113 | 13x |
string match; |
114 | 113x |
for (int i = 0, ck = 1e3; i < n; i++) { |
115 | 100x |
unordered_map<std::string, int> matches; |
116 | 100x |
const boost::regex re(terms[i]); |
117 | 330x |
for (const std::string &t : text) { |
118 | 230x |
if (raw) { |
119 | 197x |
boost::sregex_iterator search(t.cbegin(), t.cend(), re), search_end; |
120 | 1597x |
for (; search != search_end; search++) { |
121 | 1400x |
match = (*search).str(); |
122 | 1400x |
if (matches.find(match) == matches.end()) { |
123 | 1320x |
matches.insert({match, 1}); |
124 |
} else |
|
125 | 80x |
matches[match]++; |
126 |
} |
|
127 |
} else { |
|
128 | 33x |
if (boost::regex_match(t, re)) { |
129 | 3x |
if (matches.find(t) == matches.end()) { |
130 | 3x |
matches.insert({t, 1}); |
131 |
} else |
|
132 | ! |
matches[t]++; |
133 |
} |
|
134 |
} |
|
135 | 230x |
if (!--ck) { |
136 | ! |
checkUserInterrupt(); |
137 | ! |
ck = 1e3; |
138 |
} |
|
139 |
} |
|
140 | 100x |
res[i] = matches; |
141 |
} |
|
142 | 26x |
return res; |
143 |
} |