lingmatch coverage - 88.81%

Files
Source

#' Document-Term Matrix Categorization
#'
#' Reduces the dimensions of a document-term matrix by dictionary-based categorization.
#' @param dtm A matrix with terms as column names.
#' @param dict The name of a provided dictionary
#'   (\href{https://osf.io/y6g5b/wiki/home}{osf.io/y6g5b/wiki}) or of a file found in
#'   \code{dir}, or a \code{list} object with named character vectors as word lists,
#'   or the path to a file to be read in by \code{\link{read.dic}}.
#' @param term.weights A \code{list} object with named numeric vectors lining up with the character
#'   vectors in \code{dict}, used to weight the terms in each \code{dict} vector. If a category in
#'   \code{dict} is not specified in \code{term.weights}, or the \code{dict} and \code{term.weights}
#'   vectors aren't the same length, the weights for that category will be 1.
#' @param bias A list or named vector specifying a constant to add to the named category. If a term
#'   matching \code{bias.name} is included in a category, it's associated \code{weight} will be used
#'   as the \code{bias} for that category.
#' @param bias.name A character specifying a term to be used as a category bias; default is
#'   \code{'_intercept'}.
#' @param escape Logical indicating whether the terms in \code{dict} should not be treated as plain
#'   text (including asterisk wild cards). If \code{TRUE}, regular expression related characters are
#'   escaped. Set to \code{TRUE} if you get PCRE compilation errors.
#' @param partial Logical; if \code{TRUE} terms are partially matched (not padded by ^ and $).
#' @param glob Logical; if \code{TRUE} (default), will convert initial and terminal asterisks to
#'   partial matches.
#' @param to.lower Logical; if \code{TRUE} will lowercase dictionary terms. Otherwise, dictionary
#'   terms will be converted to match the terms if they are single-cased. Set to \code{FALSE} to
#'   always keep dictionary terms as entered.
#' @param term.filter A regular expression string used to format the text of each term (passed to
#'   \code{gsub}). For example, if terms are part-of-speech tagged (e.g.,
#'   \code{'a_DT'}), \code{'_.*'} would remove the tag.
#' @param term.break If a category has more than \code{term.break} characters, it will be processed
#'   in chunks. Reduce from 20000 if you get a PCRE compilation error.
#' @param dir Path to a folder in which to look for \code{dict}; \cr
#'   will look in \code{'~/Dictionaries'} by default. \cr
#'   Set a session default with \code{options(lingmatch.dict.dir = 'desired/path')}.
#' @param coverage Logical; if \code{TRUE}, will calculate coverage
#'   (number of unique term matches) for each category.
#' @seealso For applying pattern-based dictionaries (to raw text) see \code{\link{lma_patcat}()}.
#' @family Dictionary functions
#' @return A matrix with a row per \code{dtm} row and columns per dictionary category
#' (with added \code{coverage_} versions if \code{coverage} is \code{TRUE}),
#' and a \code{WC} attribute with original word counts.
#' @examples
#' dict <- list(category = c("cat", "dog", "pet*"))
#' lma_termcat(c(
#'   "cat, cat, cat, cat, cat, cat, cat, cat",
#'   "a cat, dog, or anything petlike, really",
#'   "petite petrochemical petitioned petty peter for petrified petunia petals"
#' ), dict, coverage = TRUE)
#'
#' \dontrun{
#'
#' # Score texts with the NRC Affect Intensity Lexicon
#'
#' dict <- readLines("https://saifmohammad.com/WebDocs/NRC-AffectIntensity-Lexicon.txt")
#' dict <- read.table(
#'   text = dict[-seq_len(grep("term\tscore", dict, fixed = TRUE)[[1]])],
#'   col.names = c("term", "weight", "category")
#' )
#'
#' text <- c(
#'   angry = paste(
#'     "We are outraged by their hateful brutality,",
#'     "and by the way they terrorize us with their hatred."
#'   ),
#'   fearful = paste(
#'     "The horrific torture of that terrorist was tantamount",
#'     "to the terrorism of terrorists."
#'   ),
#'   joyous = "I am jubilant to be celebrating the bliss of this happiest happiness.",
#'   sad = paste(
#'     "They are nearly suicidal in their mourning after",
#'     "the tragic and heartbreaking holocaust."
#'   )
#' )
#'
#' emotion_scores <- lma_termcat(text, dict)
#' if (require("splot")) splot(emotion_scores ~ names(text), leg = "out")
#'
#' ## or use the standardized version (which includes more categories)
#'
#' emotion_scores <- lma_termcat(text, "nrc_eil", dir = "~/Dictionaries")
#' emotion_scores <- emotion_scores[, c("anger", "fear", "joy", "sadness")]
#' if (require("splot")) splot(emotion_scores ~ names(text), leg = "out")
#' }
#' @export

lma_termcat <- function(dtm, dict, term.weights = NULL, bias = NULL, bias.name = "_intercept",
                        escape = TRUE, partial = FALSE, glob = TRUE, term.filter = NULL, term.break = 2e4,
                        to.lower = FALSE, dir = getOption("lingmatch.dict.dir"), coverage = FALSE) {
  st <- proc.time()[[3]]
  if (ckd <- dir == "") dir <- "~/Dictionaries"
  if (missing(dict)) dict <- lma_dict(1:9)
  if (is.factor(dict)) dict <- as.character(dict)
  if (is.character(dict) && length(dict) == 1 && missing(term.weights) && (file.exists(dict) || !grepl("\\s", dict))) {
    if (!file.exists(dict) && any(file.exists(normalizePath(paste0(dir, "/", dict), "/", FALSE)))) {
      dict <- normalizePath(paste0(dir, "/", dict))
    }
    td <- tryCatch(read.dic(dict, dir = if (ckd) "" else dir), error = function(e) NULL)
    dict <- if (is.null(td)) list(cat1 = dict) else td
  }
  if (!is.null(dim(dict))) {
    if (!is.null(term.weights)) {
      if (is.character(term.weights) && any(su <- term.weights %in% colnames(dict))) {
        term.weights <- dict[, term.weights[su], drop = FALSE]
      }
      if (!is.null(dim(term.weights))) {
        term.weights <- term.weights[, vapply(
          seq_len(ncol(term.weights)),
          function(col) is.numeric(term.weights[, col]), TRUE
        )]
      }
    } else if (any(su <- vapply(seq_len(ncol(dict)), function(col) is.numeric(dict[, col]), TRUE))) {
      term.weights <- dict[, su, drop = FALSE]
      dict <- if (all(su)) {
        if (!is.null(rownames(dict))) {
          data.frame(term = rownames(dict), stringsAsFactors = FALSE)
        } else {
          term.weights <- if (ncol(term.weights) == 1) NULL else term.weights[, -1, drop = FALSE]
          dict[, 1, drop = FALSE]
        }
      } else {
        dict[, !su, drop = FALSE]
      }
    }
    if (!is.null(rownames(dict)) && ncol(dict) == 1 && any(grepl("^[a-z]", rownames(dict), TRUE))) {
      dict <- rownames(dict)
    } else {
      su <- vapply(seq_len(ncol(dict)), function(col) !is.numeric(dict[, col]), TRUE)
      if (!any(su)) stop("no terms found in dictionary")
      dict <- if (sum(su) > 1) {
        su <- which(su)
        if (!is.null(term.weights) && (!is.list(term.weights) || ncol(term.weights) == 1)) {
          if (is.list(term.weights)) term.weights <- term.weights[, 1]
          ssu <- vapply(su, function(col) length(unique(dict[, col])), 0) + seq(length(su), 1)
          term.weights <- split(term.weights, dict[, which.min(ssu)])
          dict <- split(dict[, which.max(ssu)], dict[, which.min(ssu)])
        } else {
          ssu <- vapply(su, function(col) anyDuplicated(dict[, col]) == 0, TRUE)
          if (any(ssu)) dict[, su[ssu][1]] else dict[, su[1]]
        }
      } else {
        dict[, su]
      }
    }
  }
  if (is.numeric(dict) && is.null(term.weights)) {
    term.weights <- dict
    dict <- names(dict)
  }
  if (is.factor(dict)) dict <- as.character(dict)
  if (!is.null(dim(term.weights))) {
    if (is.null(colnames(term.weights))) colnames(term.weights) <- if (length(dict) == length(term.weights)) names(dict) else paste0("cat", seq_len(ncol(term.weights)))
    if (!is.data.frame(term.weights)) term.weights <- as.data.frame(term.weights, stringsAsFactors = FALSE)
    su <- vapply(term.weights, is.numeric, TRUE)
    if (any(!su)) {
      if (any(ssu <- !su & vapply(term.weights, anyDuplicated, 0) == 0)) {
        rownames(term.weights) <- term.weights[, which(ssu)[1]]
      }
      term.weights <- term.weights[, su]
    }
    if (!length(term.weights)) stop("no numeric columns in term.weights")
  }
  if (!is.list(dict)) {
    dict <- if (is.matrix(dict)) {
      as.data.frame(dict, stringsAsFactors = FALSE)
    } else if (is.character(dict) && length(dict) == 1 && (file.exists(dict) || dict %in% rownames(select.dict()$info))) {
      read.dic(dict, dir = if (ckd) "" else dir)
    } else {
      list(dict)
    }
  }
  if (is.list(dict)) {
    if (is.null(names(dict))) {
      tn <- if (!is.null(colnames(term.weights))) colnames(term.weights) else names(term.weights)
      names(dict) <- if (!is.null(tn) && length(tn) == length(dict)) tn else paste0("cat", seq_along(dict))
    } else if (any(su <- names(dict) == "")) {
      names(dict)[su] <- if (sum(su) == 1) "cat_unnamed" else paste0("cat_unnamed", seq_len(sum(su)))
      if (!is.null(term.weights) && any(su <- names(term.weights) == "")) {
        names(term.weights)[su] <- if (sum(su) == 1) "cat_unnamed" else paste0("cat_unnamed", seq_len(sum(su)))
      }
    }
  } else {
    dict <- list(dict)
  }
  if (!is.null(term.weights)) {
    if (is.null(dim(term.weights))) {
      if (is.list(term.weights)) {
        if (length(dict) != length(term.weights) && !is.null(names(term.weights[[1]]))) dict <- term.weights
        if (length(dict) == length(term.weights) && !all(names(dict) %in% names(term.weights))) {
          if (is.null(names(term.weights)) || !any(names(term.weights) %in% names(dict))) {
            names(term.weights) <- names(dict)
          } else {
            for (cat in names(dict)[!names(dict) %in% names(term.weights)]) {
              term.weights[cat] <- structure(rep(1, length(dict[[cat]])), names = dict[[cat]])
            }
          }
        }
        for (cat in names(dict)) {
          if (is.null(names(term.weights[[cat]]))) {
            if (length(term.weights[[cat]]) == length(dict[[cat]])) {
              names(term.weights[[cat]]) <- dict[[cat]]
            } else {
              term.weights[[cat]] <- structure(rep(1, length(dict[[cat]])), names = dict[[cat]])
            }
          }
        }
      } else {
        if (is.null(names(term.weights))) {
          if (length(dict[[1]]) == length(term.weights)) {
            term.weights <- list(term.weights)
            names(term.weights) <- names(dict)
            names(term.weights[[1]]) <- dict[[1]]
          } else {
            term.weights <- NULL
            warning("term.weights were dropped as they could not be aligned with dict")
          }
        }
      }
    } else {
      if (length(dict) == 1 && length(dict[[1]]) == nrow(term.weights) &&
        !any(grepl("[a-z]", rownames(term.weights), TRUE))) {
        if (is.factor(dict[[1]])) dict[[1]] <- as.character(dict[[1]])
        if (anyDuplicated(dict[[1]])) {
          dt <- unique(dict[[1]][duplicated(dict[[1]])])
          su <- dict[[1]] %in% dt
          td <- term.weights[su, ]
          tw <- matrix(0, length(dt), ncol(term.weights), dimnames = list(dt, colnames(term.weights)))
          for (term in dt) tw[term, ] <- colMeans(term.weights[dict[[1]] == term, , drop = FALSE], na.rm = TRUE)
          term.weights <- rbind(term.weights[!su, ], tw)
          rownames(term.weights) <- c(dict[[1]][!su], dt)
          dict[[1]] <- rownames(term.weights)
        } else {
          rownames(term.weights) <- dict[[1]]
        }
      }
    }
    if (!is.null(term.weights)) {
      if (!is.list(term.weights)) term.weights <- list(term.weights)
      dlen <- length(dict)
      if (is.null(names(term.weights))) {
        names(term.weights) <- if (length(term.weights) == dlen) names(dict) else seq_along(term.weights)
      }
      if (length(term.weights) > dlen && dlen == 1 && all(vapply(term.weights, length, 0) == length(dict[[1]]))) {
        dict <- lapply(term.weights, function(ws) dict[[1]])
      }
    }
  }
  dict <- lapply(dict, function(cat) {
    if (!is.character(cat)) {
      if (is.null(names(cat))) as.character(cat) else names(cat)
    } else {
      cat
    }
  })
  if (!is.null(bias) && is.null(names(bias))) {
    names(bias) <- if (length(bias) == length(dict)) names(dict) else seq_along(bias)
  }
  if (!is.null(names(term.weights)) && length(names(term.weights)) == length(dict)) names(dict) <- names(term.weights)
  for (n in names(dict)) {
    if (!n %in% names(bias) && any(ii <- !is.na(dict[[n]]) & dict[[n]] == bias.name)) {
      bias[n] <- term.weights[[n]][ii]
      term.weights[[n]][ii] <- 0
    }
  }
  dict_chars <- list(
    all = paste(unique(strsplit(paste0(unique(unlist(dict, use.names = FALSE)), collapse = ""), "")[[1]]),
      collapse = ""
    )
  )
  dict_chars$alpha <- gsub("[^A-Za-z]", "", dict_chars$all)
  dict_chars$case <- if (grepl("[A-Z]", dict_chars$alpha)) {
    if (grepl("[a-z]", dict_chars$alpha)) "mixed" else "upper"
  } else {
    "lower"
  }
  edtm <- substitute(dtm)
  if (is.factor(dtm)) dtm <- as.character(dtm)
  if (is.character(dtm) || !any(grepl("\\s", colnames(dtm)))) {
    if (any(grepl("\\s", unlist(dict, use.names = FALSE)))) {
      if (is.character(dtm)) {
        warning(
          "dict has terms with spaces, so using lma_patcat instead;",
          "\n  enter a dtm (e.g., lma_dtm(", paste0(edtm, collapse = ""), ")) to force lma_termcat use"
        )
        args <- list(text = dtm, dict = dict)
        if (!is.null(term.weights)) args$pattern.weights <- term.weights
        if (!is.null(bias)) args$bias <- bias
        if (!missing(glob)) args$globtoregex <- glob
        if (!missing(partial) && !partial) args$boundary <- "\\b"
        if (!missing(dir)) args$dir <- if (ckd) "" else dir
        return(do.call(lma_patcat, args))
      }
    }
    if (is.character(dtm)) {
      if (dict_chars$case == "upper") dtm <- toupper(dtm)
      dtm <- lma_dtm(dtm,
        numbers = grepl("[0-9]", dict_chars$all), punct = grepl('[_/\\?!."-]', dict_chars$all),
        to.lower = dict_chars$case == "lower"
      )
    }
  }
  if (is.null(dim(dtm))) dtm <- t(dtm)
  ats <- attributes(dtm)[c("opts", "WC", "type")]
  ats <- ats[!vapply(ats, is.null, TRUE)]
  atsn <- names(ats)
  ws <- if (is.null(term.filter)) colnames(dtm) else gsub(term.filter, "", colnames(dtm), perl = TRUE)
  if ((missing(to.lower) || !is.logical(to.lower)) && dict_chars$case != "mixed") {
    text_case <- if (any(grepl("[A-Z]", ws))) if (any(grepl("[a-z]", ws))) "mixed" else "upper" else "lower"
    if (text_case == "upper") {
      dict <- lapply(dict, toupper)
      dict_chars$case <- "upper"
    }
    to.lower <- text_case == "lower"
  }
  if (to.lower && dict_chars$case != "lower") {
    dict <- lapply(dict, tolower)
    dict_chars$case <- "lower"
  }
  if (dict_chars$case != "mixed") ws <- (if (dict_chars$case == "lower") tolower else toupper)(ws)
  odict <- dict
  boundaries <- FALSE
  formatdict <- function(dict, collapse = "|") {
    lab <- if (!escape) {
      lab <- lapply(dict, function(l) {
        if (!any(grepl("[][)(}{]", l))) {
          return(FALSE)
        }
        sl <- strsplit(l, "")
        !any(grepl("\\[.+\\]|\\(.+\\)|\\{.+\\}", l)) || any(vapply(
          sl, function(cs) {
            sum(sl == "[") != sum(sl == "]") &
              sum(sl == "{") != sum(sl == "}") &
              sum(sl == "(") != sum(sl == ")")
          },
          TRUE
        ))
      })
      Filter(isTRUE, lab)
    } else {
      logical()
    }
    if (!partial) {
      s <- "^"
      e <- "$"
    } else {
      s <- e <- ""
    }
    rec <- "([][)(}{*.^$+?\\|\\\\])"
    if (length(lab)) {
      for (l in names(lab)) dict[[l]][lab[[l]]] <- gsub("([][)(}{])", "\\\\\\1", dict[[l]][lab[[l]]])
      rec <- "([*.^$+?\\|])"
    }
    res <- if (escape) {
      lapply(dict, function(l) {
        paste0(s, gsub(rec, "\\\\\\1", l, perl = TRUE), e, collapse = collapse)
      })
    } else {
      lapply(dict, function(l) paste(paste0(s, gsub("([+*])[+*]+", "\\\\\\1+", l), e), collapse = collapse))
    }
    if (glob) {
      lapply(res, function(l) {
        gsub(paste0(
          if (s == "^") "\\" else "", s, if (escape) "\\\\" else "", "\\*|", if (escape) "\\\\" else "", "\\*", if (e == "$") {
            "\\"
          } else {
            ""
          }, e
        ), "", l)
      })
    } else {
      res
    }
  }
  for (l in dict) {
    if (!boundaries) boundaries <- !any(grepl("^\\*|\\*$", l)) && any(grepl("^\\^|\\$$", l))
    if (missing(partial) && boundaries) partial <- TRUE
    if (missing(glob) && (any(grepl("([][}{.^$+?\\|\\\\])", l)) || any(grepl("\\w\\*\\w", l)))) glob <- FALSE
    if (missing(escape) && (boundaries || any(grepl("[.])][+*]|[.+*]\\?|\\[\\^", l))) &&
      !any(grepl("[({[][^])}]*$|^[^({[]*[])}]", l))) {
      escape <- FALSE
    }
  }
  cls <- 0
  if (is.null(term.weights)) {
    cls <- structure(numeric(length(dict)), names = names(dict))
    for (cat in seq_along(dict)) {
      ccls <- tryCatch(nchar(dict[[cat]]), error = function(e) NULL)
      if (is.null(ccls)) {
        warning(
          "dict appears to be misencoded, so results may not be as expected;\n",
          'might try reading the dictionary in with encoding = "latin1"'
        )
        dict[[cat]] <- iconv(dict[[cat]], sub = "#")
        ccls <- nchar(dict[[cat]])
      }
      cls[cat] <- sum(ccls)
    }
  }
  if (any(cls > term.break)) {
    br <- function(l, e = term.break) {
      f <- ceiling(cls[[l]] / e)
      l <- length(dict[[l]])
      e <- ceiling(l / f)
      o <- lapply(seq_len(f), function(i) seq_len(e) + e * (i - 1))
      o[[f]] <- o[[f]][o[[f]] <= l]
      o
    }
    op <- matrix(0, nrow(dtm), length(dict), dimnames = list(rownames(dtm), names(dict)))
    if (coverage) cop <- op
    for (cat in names(dict)) {
      matches <- if (cls[[cat]] > term.break) {
        unique(unlist(lapply(br(cat), function(s) {
          grep(formatdict(list(dict[[cat]][s]))[[1]], ws, perl = TRUE)
        })))
      } else {
        grep(formatdict(list(dict[[cat]])), ws, perl = TRUE)
      }
      if (length(matches)) {
        su <- dtm[, matches, drop = FALSE]
        op[, cat] <- rowSums(su, na.rm = TRUE)
        if (coverage) cop[, cat] <- rowSums(su != 0, na.rm = TRUE)
      }
    }
    if (coverage) {
      colnames(cop) <- paste0("coverage_", colnames(op))
      op <- cbind(op, cop)
    }
  } else {
    if (!is.null(term.weights)) {
      dict <- formatdict(dict, NULL)
      terms <- unique(unlist(dict))
      termmap <- lapply(terms, grep, ws, perl = TRUE, value = TRUE)
      names(termmap) <- unique(unlist(odict))
      termmap <- Filter(length, termmap)
      if (is.null(dim(term.weights))) {
        op <- matrix(0, nrow(dtm), length(dict), dimnames = list(rownames(dtm), names(dict)))
        if (length(termmap)) {
          weights <- lapply(names(term.weights), function(n) {
            l <- term.weights[[n]]
            if (is.null(names(l)) && n %in% names(dict) && length(dict[[n]]) == length(l)) {
              names(term.weights[[n]]) <- dict[[n]]
              l <- term.weights[[n]]
            }
            if (any(su <- !names(termmap) %in% names(l))) l[names(termmap)[su]] <- 0
            do.call(c, lapply(names(termmap), function(p) {
              structure(rep(l[[p]], length(termmap[[p]])), names = termmap[[p]])
            }))
          })
          names(weights) <- names(term.weights)
          for (cat in names(dict)) {
            if (length(weights[[cat]])) {
              op[, cat] <- as.numeric(dtm[, names(weights[[cat]]), drop = FALSE] %*% weights[[cat]])
            }
          }
        }
      } else {
        if (length(termmap)) {
          weights <- do.call(rbind, lapply(names(termmap), function(p) {
            matrix(
              rep(as.numeric(term.weights[p, ]), length(termmap[[p]])),
              ncol = ncol(term.weights), dimnames = list(termmap[[p]], colnames(term.weights))
            )
          }))
          op <- matrix(0, nrow(dtm), ncol(weights), dimnames = list(rownames(dtm), colnames(weights)))
          for (cat in colnames(op)) {
            op[, cat] <- as.numeric(dtm[, rownames(weights), drop = FALSE] %*% weights[, cat])
          }
        } else {
          op <- matrix(0, nrow(dtm), length(dict), dimnames = list(rownames(dtm), colnames(weights)))
        }
      }
    } else {
      dict <- formatdict(dict)
      if (coverage) {
        op <- vapply(names(dict), function(cat) {
          su <- dtm[, grep(dict[[cat]], ws, perl = TRUE), drop = FALSE]
          c(rowSums(su != 0, na.rm = TRUE), rowSums(su, na.rm = TRUE))
        }, numeric(nrow(dtm) * 2))
        cop <- op[seq_len(nrow(dtm)), , drop = FALSE]
        colnames(cop) <- paste0("coverage_", names(dict))
        op <- cbind(op[-seq_len(nrow(dtm)), , drop = FALSE], cop)
      } else {
        op <- vapply(names(dict), function(cat) {
          rowSums(dtm[, grep(dict[[cat]], ws, perl = TRUE),
            drop = FALSE
          ], na.rm = TRUE)
        }, numeric(nrow(dtm)))
      }
      if (nrow(dtm) == 1) {
        op <- t(op)
        rownames(op) <- 1
      }
    }
  }
  if (!is.null(bias)) for (n in names(bias)) if (n %in% colnames(op)) op[, n] <- op[, n] + bias[[n]]
  attr(op, "WC") <- if ("WC" %in% atsn) {
    ats$WC
  } else if (all(vapply(seq_len(ncol(dtm)), function(i) {
    is.numeric(dtm[, i]) || is.integer(dtm[, i])
  }, TRUE))) {
    rowSums(dtm, na.rm = TRUE)
  } else {
    NULL
  }
  attr(op, "time") <- c(attr(dtm, "time"), termcat = proc.time()[[3]] - st)
  if ("type" %in% atsn) attr(op, "type") <- ats$type
  op
}

#' Read/Write Dictionary Files
#'
#' Read in or write dictionary files in Comma-Separated Values (.csv; weighted) or
#' Linguistic Inquiry and Word Count (.dic; non-weighted) format.
#' @param path Path to a file, a name corresponding to a file in \code{getOption('lingmatch.dict.dir')}
#'   (or \code{'~/Dictionaries'}) or one of the dictionaries available at \href{https://osf.io/y6g5b}{osf.io/y6g5b},
#'   a matrix-like object to be categorized, or a list to be formatted.
#' @param cats A character vector of category names to be returned. All categories are returned by default.
#' @param type A character indicating whether and how terms should be altered. Unspecified or matching 'asis'
#'   leaves terms as they are. Other options change wildcards to regular expressions:
#'   \code{'pattern'} (\code{'^[poi]'}) replaces initial asterisks with \code{'\\\\b\\\\w*'},
#'   and terminal asterisks with \code{'\\\\w*\\\\b'}, to match terms within raw text;
#'   for anything else, terms are padded with \code{^} and \code{$}, then those bounding marks are removed
#'   when an asterisk is present, to match tokenized terms.
#' @param as.weighted Logical; if \code{TRUE}, prevents weighted dictionaries from being converted to
#'   unweighted versions, or converts unweighted dictionaries to a binary weighted version
#'   -- a data.frame with a "term" column of unique terms, and a column for each category.
#' @param dir Path to a folder containing dictionaries, or where you would like dictionaries to be downloaded;
#'   passed to \code{\link{select.dict}} and/or \code{\link{download.dict}}.
#' @param ... Passes arguments to \code{\link{readLines}}.
#' @param term.name,category.name Strings identifying column names in \code{path} containing terms and categories
#'   respectively.
#' @param raw Logical or a character. As logical, indicates if \code{path} should be treated
#'   as a raw dictionary (as might be read in from a .dic file). As a character, replaces \code{path}
#'   as if it were read in from a file.
#' @return \code{read.dic}: A \code{list} (unweighted) with an entry for each category containing
#'   character vectors of terms, or a \code{data.frame} (weighted) with columns for terms (first, "term") and
#'   weights (all subsequent, with category labels as names).
#' @family Dictionary functions
#' @importFrom utils read.table write.table
#' @export

read.dic <- function(path, cats = NULL, type = "asis", as.weighted = FALSE, dir = getOption("lingmatch.dict.dir"), ...,
                     term.name = "term", category.name = "category", raw = FALSE) {
  if (ckd <- dir == "") dir <- "~/Dictionaries"
  if (!is.logical(raw)) {
    path <- raw
    raw <- TRUE
  }
  if (missing(path)) path <- file.choose()
  if (!raw) {
    if (is.character(path) && !any(file.exists(path)) &&
      any(file.exists(normalizePath(paste0(dir, "/", path), "/", FALSE)))) {
      path <- normalizePath(paste0(dir, "/", path), "/", FALSE)
    }
    if (is.character(path) && !any(file.exists(path))) {
      tp <- select.dict(path, dir = if (ckd) "" else dir)
      if (nrow(tp$selected) && length(path) <= nrow(tp$info)) {
        if (any(tp$selected$downloaded == "")) {
          td <- rownames(tp$selected)[tp$selected$downloaded == ""]
          if (!ckd && grepl("^$|^[yt1]|^ent", readline(paste0(
            "would you like to download ", if (length(td) == 1) "this dictionary" else "these dictionaries", "?:\n",
            sub(",(?=[^,]+$)", if (length(td) == 2) " and" else ", and", paste0(td, collapse = ", "), perl = TRUE),
            "\n(press Enter for yes): "
          )))) {
            tp$selected[td, "downloaded"] <- unlist(download.dict(td, dir = dir), use.names = FALSE)
          }
        }
        path <- tp$selected[tp$selected[, "downloaded"] != "", "downloaded"]
        if (!length(path)) {
          stop(
            if (nrow(tp$selected) == 1) "dictionary" else "dictionaries", " (",
            paste(rownames(tp$selected), collapse = ", "), ") not found in dir (", dir, ")",
            if (ckd) '\nspecify a directory (e.g., dir = "~") to locate or download; see ?download.dict',
            call. = FALSE
          )
        }
      }
    }
    if (is.character(path) && length(path) > 1 && any(file.exists(path))) {
      dicts <- list()
      for (p in path) {
        if (file.exists(p)) {
          name <- gsub("^.*[/\\]+|\\.[^.]+$", "", p)
          dicts[[name]] <- read.dic(p)
        }
      }
      path <- if (length(dicts) == 1) dicts[[1]] else dicts
    }
  }
  if (!is.null(dim(path))) {
    if (anyNA(path)) path[is.na(path)] <- 0
    cols <- colnames(path)
    if (term.name %in% cols) {
      terms <- path[, term.name]
      cols <- cols[cols != term.name]
    } else if (!is.null(rownames(path)) && any(grepl("[a-z]", rownames(path), TRUE))) {
      terms <- rownames(path)
    } else {
      su <- which(vapply(cols, function(col) !is.numeric(path[, col]), TRUE))
      if (!length(su)) {
        if (!is.null(colnames(path))) {
          path <- data.frame(term = colnames(path), t(path), stringsAsFactors = FALSE)
          terms <- path$term
          cols <- colnames(path)[-1]
          if (missing(as.weighted)) as.weighted <- TRUE
        } else {
          stop("no non-numeric columns found in path")
        }
      } else {
        if (length(su) > 1) {
          ssu <- vapply(su, function(col) {
            if (!anyDuplicated(path[, col])) {
              1
            } else if (all(path[, col] == path[1, col])) 0 else 2
          }, 0)
          if (length(su) == ncol(path) && !any(ssu == 0)) {
            path <- data.frame(
              term = unlist(path[, su], use.names = FALSE),
              category = rep(colnames(path), each = nrow(path))
            )
            category.name <- cols <- "category"
            su <- 1
          } else {
            if (any(ssu == 2)) {
              cols <- colnames(path)[su[which(ssu == 2)]]
              if (length(cols) > 1 && length(su) != ncol(path)) cols <- cols[1]
            }
            su <- if (any(ssu == 1)) su[which(ssu == 1)[[1]]] else su[[1]]
          }
        }
        terms <- path[, su]
      }
    }
    if (category.name %in% colnames(path)) {
      su <- which(vapply(cols, function(col) is.numeric(path[, col]), TRUE))
      cols <- path[, category.name]
      if (length(su) == 1) {
        weights <- path[, names(su)]
        wl <- data.frame(term = terms)
        v <- numeric(nrow(path))
        for (cat in unique(cols)) {
          su <- cols == cat
          v[su] <- weights[su]
          wl[, cat] <- v
          v[] <- 0
        }
      } else {
        wl <- split(terms, cols)
      }
    } else {
      su <- vapply(cols, function(col) {
        if (is.numeric(path[, col])) {
          1
        } else if (anyDuplicated(path[, col])) {
          if (!all(path[, col] == path[1, col])) 2 else 3
        } else {
          0
        }
      }, 0)
      wl <- if (!1 %in% su && any(su == 2) && any(su != 2)) {
        cols <- cols[su == 2]
        wl <- lapply(cols, function(col) split(terms, path[, col]))
        names(wl) <- cols
        wl
      } else {
        if (!any(su == 1)) {
          if (any(su > 1)) {
            cols <- cols[su > 1]
            if (length(cols) == 1) {
              split(terms, path[, cols])
            } else {
              wl <- lapply(cols, function(col) split(terms, path[, col]))
              names(wl) <- cols
              unlist(wl, FALSE)
            }
          } else {
            stop("no numeric columns found in path")
          }
        } else if (as.weighted) {
          cbind(term = terms, path[, cols[su == 1], drop = FALSE])
        } else {
          cols <- cols[su == 1]
          if (length(cols) == 1) {
            weights <- path[, cols]
            if (any(weights < 0) && any(weights > 0)) {
              Filter(length, list(
                positive = terms[weights > 0],
                neutral = terms[weights == 0],
                negative = terms[weights < 0]
              ))
            } else if (anyDuplicated(weights)) split(terms, weights) else list(category = terms)
          } else {
            weights <- as.data.frame(path[, cols], stringsAsFactors = FALSE)
            if (any(weights > 0) && any(weights < 0)) {
              for (col in cols) {
                if (any(weights[, col] > 0)) weights[, paste0(col, ".positive")] <- as.integer(weights[, col] > 0)
                if (any(weights[, col] == 0)) weights[, paste0(col, ".neutral")] <- as.integer(weights[, col] == 0)
                if (any(weights[, col] < 0)) weights[, paste0(col, ".negative")] <- as.integer(weights[, col] < 0)
                weights <- weights[, colnames(weights) != col]
              }
              cols <- sort(colnames(weights))
            }
            lvs <- sort(unique(unlist(weights)))
            if (length(lvs) == 2 && all(lvs == c(0, 1))) {
              wl <- lapply(cols, function(col) terms[weights[, col] == 1])
              names(wl) <- cols
              wl <- Filter(length, wl)
            } else {
              wl <- split(terms, colnames(weights)[max.col(weights, "first")])
            }
            wl
          }
        }
      }
    }
  } else if (is.list(path)) {
    path <- Filter(length, path)
    if (all(vapply(path, function(d) is.character(d) || is.factor(d), TRUE))) {
      wl <- path
      if (is.null(names(wl))) names(wl) <- paste0("cat", seq_along(wl))
      if (!is.null(cats)) {
        wl <- wl[names(wl) %in% cats]
        if (!length(wl)) stop("no cats were found in path")
      }
    } else {
      if (is.null(names(path))) names(path) <- paste0("dict", seq_along(path))
      wl <- if (any(vapply(path, function(d) !is.null(dim(d)), TRUE))) {
        terms <- NULL
        cols <- NULL
        for (d in names(path)) {
          path[[d]] <- read.dic(path[[d]], as.weighted = as.weighted)
          if (as.weighted) {
            terms <- unique(c(terms, path[[d]]$term))
            cols <- c(cols, paste0(d, ".", colnames(path[[d]])[-1]))
          }
        }
        if (as.weighted) {
          wl <- as.data.frame(
            matrix(0, length(terms), length(cols), dimnames = list(terms, cols)),
            stringsAsFactors = FALSE
          )
          for (d in names(path)) {
            cols <- colnames(path[[d]])[-1] <- paste0(d, ".", colnames(path[[d]])[-1])
            su <- duplicated(path[[d]]$term)
            if (any(su)) {
              su <- path[[d]]$term %in% path[[d]]$term[su]
              td <- path[[d]][su, , drop = FALSE]
              for (term in unique(td$term)) wl[term, cols] <- colMeans(td[td$term == term, cols])
            }
            if (any(!su)) path[[d]] <- path[[d]][!su, ]
            rownames(path[[d]]) <- path[[d]]$term
            wl[path[[d]]$term, cols] <- path[[d]][, cols]
          }
          data.frame(term = rownames(wl), wl, stringsAsFactors = FALSE)
        } else {
          unlist(path, FALSE)
        }
      } else if (any(vapply(path, is.list, TRUE))) {
        unlist(path, FALSE)
      } else {
        if (any(vapply(path, function(d) is.null(names(d)), TRUE))) {
          if (all(vapply(path, length, 0) == length(path[[1]]))) {
            data.frame(term = names(path), do.call(rbind, path), stringsAsFactors = FALSE)
          } else {
            stop("failed to resolve path; as a list, entries should contain character or named numeric vectors")
          }
        } else {
          terms <- unique(unlist(lapply(path, names), use.names = FALSE))
          v <- structure(numeric(length(terms)), names = terms)
          data.frame(term = terms, vapply(path, function(d) {
            v[names(d)] <- d
            v
          }, numeric(length(terms))), stringsAsFactors = FALSE)
        }
      }
    }
  } else {
    if (raw || length(path) != 1) {
      if (length(path) == 1) path <- strsplit(path, "\n")[[1]]
      di <- path
    } else {
      di <- tryCatch(readLines(path, warn = FALSE, ...), error = function(e) NULL)
      if (is.null(di)) stop("assumed path (", path, ") is to a file, but failed to read it in", call. = FALSE)
    }
    lst <- grep("%", di, fixed = TRUE)
    if (length(lst) > 1 && !grepl(",", di[lst[1]], fixed = TRUE)) {
      if (length(lst) < 2) {
        stop(
          "could not identify the end of the header -- ",
          "this should be the second percent sign (%) following the last category name."
        )
      }
      lst <- lst[2]
      h <- grep("^\\d", gsub("^\\s+|\\s*%+\\s*|\\s+$", "", di[seq_len(lst)]), value = TRUE)
      ci <- character()
      for (cat in h) ci[sub("\\s.*$", "", cat)] <- sub("^[^\\s]+\\s+", "", cat, perl = TRUE)
      if (is.null(cats)) cats <- ci
      sep <- if (grepl("\t", di[lst + 1], fixed = TRUE)) "\t" else "\\s"
      cb <- paste0("(?:", sep, "+(?:", paste(names(ci), collapse = "|"), ")(?=", sep, "|$))*$")
      di <- di[-seq_len(lst - 1)]
      wl <- lapply(structure(names(ci), names = ci), function(cat) {
        unique(sub(cb, "", di[grep(paste0(sep, cat, cb), di, perl = TRUE)], perl = TRUE))
      })
      wl <- wl[cats[cats %in% names(wl)]]
    } else {
      if (missing(as.weighted) && length(path) == 1) as.weighted <- TRUE
      wl <- if (any(grepl("[\\s,]", di, perl = TRUE))) {
        di <- read.table(
          text = di, header = TRUE, sep = if (grepl("\t", di[[1]])) "\t" else ",",
          quote = '"', comment.char = "", stringsAsFactors = FALSE
        )
        if (!missing(as.weighted) || (!term.name %in% colnames(di) && !any(vapply(di, is.character, TRUE)) &&
          !any(grepl("[a-z]", rownames(di), TRUE)))) {
          di <- tryCatch(
            read.dic(di, cats = cats, type = type, as.weighted = as.weighted),
            error = function(e) e$message
          )
        }
        di
      } else {
        list(cat1 = di)
      }
      if (length(wl) == 1 && is.character(wl)) {
        stop("assuming path is to a comma separated values file, but failed to read it in:\n", wl)
      }
    }
  }
  if (!missing(type) && !grepl("^[Aa]", type)) wl <- to_regex(wl, grepl("^[poi]", type, TRUE))
  if (is.null(dim(wl)) && !is.null(cats)) {
    su <- names(wl) %in% cats
    if (any(su)) wl <- wl[su]
  }
  if (as.weighted && is.null(dim(wl))) {
    op <- data.frame(term = unique(unlist(wl)), stringsAsFactors = FALSE)
    for (cat in names(wl)) op[, cat] <- as.integer(op$term %in% wl[[cat]])
    op
  } else {
    if (is.data.frame(wl) && !as.weighted && !missing(as.weighted)) {
      read.dic(wl, cats = cats, as.weighted = FALSE)
    } else {
      wl
    }
  }
}

#' @rdname read.dic
#' @param dict A \code{list} with a named entry of terms for each category, or a \code{data.frame}
#'   with terms in one column, and categories or weights in the rest.
#' @param filename The name of the file to be saved.
#' @param save Logical: if \code{FALSE}, does not write a file.
#' @return \code{write.dic}: A version of the written dictionary -- a raw character vector for
#'   unweighted dictionaries, or a \code{data.frame} for weighted dictionaries.
#' @examples
#' # make a small murder related dictionary
#' dict <- list(
#'   kill = c("kill*", "murd*", "wound*", "die*"),
#'   death = c("death*", "dying", "die*", "kill*")
#' )
#'
#' # convert it to a weighted format
#' (dict_weighted <- read.dic(dict, as.weighted = TRUE))
#'
#' # categorize it back
#' read.dic(dict_weighted)
#'
#' # convert it to a string without writing to a file
#' cat(raw_dict <- write.dic(dict, save = FALSE))
#'
#' # parse it back in
#' read.dic(raw = raw_dict)
#'
#' \dontrun{
#'
#' # save it as a .dic file
#' write.dic(dict, "murder")
#'
#' # read it back in as a list
#' read.dic("murder.dic")
#'
#' # read in the Moral Foundations or LUSI dictionaries from urls
#' moral_dict <- read.dic("https://osf.io/download/whjt2")
#' lusi_dict <- read.dic("https://osf.io/download/29ayf")
#'
#' # save and read in a version of the General Inquirer dictionary
#' inquirer <- read.dic("inquirer", dir = "~/Dictionaries")
#' }
#' @export

write.dic <- function(dict, filename = NULL, type = "asis", as.weighted = FALSE, save = TRUE) {
  if (!is.list(dict) || is.data.frame(dict)) {
    if (save && (missing(as.weighted) || as.weighted)) {
      as.weighted <- TRUE
      o <- dict
    } else {
      dict <- read.dic(dict)
    }
  }
  if (is.null(dim(dict))) {
    terms <- unique(as.character(unlist(dict, use.names = FALSE)))
    terms <- terms[terms != ""]
    if (!missing(type) && !grepl("^[Aa]", type)) dict <- to_regex(dict, grepl("^[poi]", type, TRUE))
    if (as.weighted) {
      o <- data.frame(term = terms, stringsAsFactors = FALSE)
      for (cat in names(dict)) o[, cat] <- as.integer(o$term %in% dict[[cat]])
    } else {
      l <- length(dict)
      m <- as.data.frame(matrix("", length(terms) + l + 2, l + 1), stringsAsFactors = FALSE)
      m[, 1] <- c("%", seq_len(l), "%", terms)
      m[seq_len(l) + 1, 2] <- if (is.null(names(dict))) seq_len(l) else names(dict)
      for (i in seq_along(dict)) m[which(m[-seq_len(l + 2), 1] %in% dict[[i]]) + l + 2, i + 1] <- i
      o <- gsub("\t{2,}", "\t", paste(sub("\t+$", "", do.call(paste, c(m, sep = "\t"))), collapse = "\n"))
    }
  }
  if (save && is.character(filename)) {
    filename <- filename[[1]]
    if (!grepl("\\.[^.]+$", filename)) filename <- paste0(filename, if (as.weighted) ".csv" else ".dic")
    if (as.weighted) {
      write.table(o, filename, sep = ",", row.names = FALSE, qmethod = "double")
    } else {
      write(o, filename)
    }
    message("dictionary saved to ", filename)
  }
  invisible(o)
}

#' Generate a Report of Term Matches
#'
#' Extract matches to fuzzy terms (globs/wildcards or regular expressions) from provided text, in order
#' to assess their appropriateness for inclusion in a dictionary.
#' @param dict A vector of terms, list of such vectors, or a matrix-like object to be
#' categorized by \code{\link{read.dic}}.
#' @param text A vector of text to extract matches from. If not specified, will use the terms
#' in the \code{term_map} retrieved from \code{\link{select.lspace}}.
#' @param space A vector space used to calculate similarities between term matches.
#' Name of a the space (see \code{\link{select.lspace}}), a matrix with terms as row names, or
#' \code{TRUE} to auto-select a space based on matched terms.
#' @param glob Logical; if \code{TRUE}, converts globs (asterisk wildcards) to regular expressions.
#' If not specified, this will be set automatically.
#' @param parse_phrases Logical; if \code{TRUE} (default) and \code{space} is specified, will
#' break unmatched phrases into single terms, and average across and matched vectors.
#' @param tolower Logical; if \code{FALSE}, will retain \code{text}'s case.
#' @param punct Logical; if \code{FALSE}, will remove punctuation markings in \code{text}.
#' @param special Logical; if \code{FALSE}, will attempt to replace special characters in \code{text}.
#' @param as_terms Logical; if \code{TRUE}, will treat \code{text} as terms, meaning \code{dict}
#' terms will only count as matches when matching the complete text.
#' @param bysentence Logical; if \code{TRUE}, will split \code{text} into sentences, and only
#' consider unique sentences.
#' @param as_string Logical; if \code{FALSE}, returns matches as tables rather than a string.
#' @param term_map_freq Proportion of terms to include when using the term map as a source
#' of terms. Applies when \code{text} is not specified.
#' @param term_map_spaces Number of spaces in which a term has to appear to be included.
#' Applies when \code{text} is not specified.
#' @param outFile File path to write results to, always ending in \code{.csv}.
#' @param space_dir Directory from which \code{space} should be loaded.
#' @param verbose Logical; if \code{FALSE}, will not display status messages.
#' @family Dictionary functions
#' @seealso For a more complete assessment of dictionaries, see \code{\link{dictionary_meta}()}.
#'
#' Similar information is provided in the \href{https://miserman.github.io/dictionary_builder/}{dictionary builder} web tool.
#' @note
#' Matches are extracted for each term independently, so they may not align with some implementations
#' of dictionaries. For instance, by default \code{\link{lma_patcat}} matches destructively, and sorts
#' terms by length such that shorter terms will not match the same text and longer terms that overlap.
#' Here, the match would show up for both terms.
#' @returns A \code{data.frame} of results, with a row for each unique term, and the following columns:
#' \itemize{
#'   \item \strong{\code{term}}: The originally entered term.
#'   \item \strong{\code{regex}}: The converted and applied regular expression form of the term.
#'   \item \strong{\code{categories}}: Comma-separated category names,
#'   if \code{dict} is a list with named entries.
#'   \item \strong{\code{count}}: Total number of matches to the term.
#'   \item \strong{\code{max_count}}: Number of matches to the most representative
#'   (that with the highest average similarity) variant of the term.
#'   \item \strong{\code{variants}}: Number of variants of the term.
#'   \item \strong{\code{space}}: Name of the latent semantic space, if one was used.
#'   \item \strong{\code{mean_sim}}: Average similarity to the most representative variant among terms
#'   found in the space, if one was used.
#'   \item \strong{\code{min_sim}}: Minimal similarity to the most representative variant.
#'   \item \strong{\code{matches}}: Variants, with counts and similarity (Pearson's r) to the
#'   most representative term (if a space was specified). Either in the form of a comma-separated
#'   string or a \code{data.frame} (if \code{as_string} is \code{FALSE}).
#' }
#' @examples
#' text <- c(
#'   "I am sadly homeless, and suffering from depression :(",
#'   "This wholesome happiness brings joy to my heart! :D:D:D",
#'   "They are joyous in these fearsome happenings D:",
#'   "I feel weightless now that my sadness has been depressed! :()"
#' )
#' dict <- list(
#'   sad = c("*less", "sad*", "depres*", ":("),
#'   happy = c("*some", "happ*", "joy*", "d:"),
#'   self = c("i *", "my *")
#' )
#'
#' report_term_matches(dict, text)
#' @export

report_term_matches <- function(dict, text = NULL, space = NULL, glob = TRUE,
                                parse_phrases = TRUE, tolower = TRUE, punct = TRUE, special = TRUE,
                                as_terms = FALSE, bysentence = FALSE, as_string = TRUE,
                                term_map_freq = 1, term_map_spaces = 1, outFile = NULL,
                                space_dir = getOption("lingmatch.lspace.dir"), verbose = TRUE) {
  if (missing(dict)) stop("dict must be specified", call. = FALSE)
  collapsed_terms <- FALSE
  if (is.null(text)) {
    term_map <- select.lspace(dir = space_dir, get.map = TRUE)$term_map
    if (is.null(term_map)) {
      stop(
        "term map not found; specify `space_dir` or provide text",
        call. = FALSE
      )
    }
    if (term_map_freq > 0 && term_map_freq < 1) {
      term_map <- term_map[seq(1, ceiling(nrow(term_map) * term_map_freq)), ]
    }
    if (term_map_spaces > 0 && term_map_spaces < ncol(term_map)) {
      term_map <- term_map[rowSums(term_map != 0) >= term_map_spaces, ]
    }
    collapsed_terms <- as_terms <- TRUE
    text <- paste(rownames(term_map), collapse = "  ")
  }
  if (is.null(text) && is.null(space)) stop("either text or space must be specified", call. = FALSE)
  st <- proc.time()[[3]]
  if (!is.null(text) && !as_terms) {
    if (verbose) cat("preparing text (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
    if (bysentence) text <- read.segments(text, segment.size = 1, bysentence = TRUE)$text
    if (tolower) text <- tolower(text)
    if (!punct) text <- gsub("[,_:;/\\\\.?!\"()\\{}[]|\\]", " ", text)
    if (!special) text <- lma_dict("special", as.function = gsub)(text)
    text <- unique(text)
  }
  if (verbose) cat("preparing dict (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
  if (!is.null(dim(dict)) || (is.character(dict) && length(dict) == 1 && file.exists(dict))) dict <- read.dic(dict)
  terms <- data.frame(term = unique(unlist(dict, use.names = FALSE)))
  if (missing(glob)) {
    glob <- any(grepl("^\\*", terms$term))
    if (!glob) {
      glob <- any(grepl("\\*$", terms$term))
      if (glob) glob <- !any(grepl("(?:\\\\\\w|[].})])\\*$", terms$term))
    }
  }
  rawtext <- !as_terms || collapsed_terms
  terms$regex <- to_regex(list(terms$term), TRUE, glob)[[1]]
  terms <- terms[!is.na(terms$regex) & terms$regex != "", ]
  terms$regex <- if (rawtext) paste0("\\b", terms$regex, "\\b") else paste0("^", terms$regex, "$")
  if (is.list(dict)) {
    if (is.null(names(dict))) names(dict) <- paste0("cat_", seq_along(dict))
    categories <- character(nrow(terms))
    for (cat in names(dict)) {
      su <- terms$term %in% dict[[cat]]
      if (any(su)) {
        ssu <- su & categories != ""
        categories[ssu] <- paste0(categories[ssu], ", ", cat)
        categories[su & categories == ""] <- cat
      }
    }
    terms$categories <- categories
  }
  if (verbose) cat("extracting matches (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
  matches <- extract_matches(terms$regex, text, rawtext)
  has_space <- FALSE
  space_name <- NULL
  if (!is.null(space)) {
    obs <- unique(unlist(lapply(matches, names), use.names = FALSE))
    if (is.logical(space) && space) {
      space <- select.lspace(terms = obs)$selected
      space <- if (nrow(space)) rownames(space)[1] else NULL
    }
    if (is.character(space)) {
      if (verbose) cat("loading space (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
      space_name <- space
      space <- lma_lspace(obs, space, dir = space_dir)
    }
    if (!nrow(space) || !any(obs %in% rownames(space))) space <- NULL
    if (is.null(space)) {
      warning("failed to recognize space")
    } else {
      su <- obs %in% rownames(space)
      if (parse_phrases && any(!su)) {
        phrase <- grepl("[ _/-]", obs)
        if (any(phrase)) {
          if (verbose) cat("parsing phrases (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
          split_parts <- strsplit(obs[phrase], "[ _/-]")
          parts <- unique(unlist(split_parts, use.names = FALSE))
          part_vectors <- if (is.null(space_name)) {
            if (any(parts %in% rownames(space))) space[parts[parts %in% rownames(space)]] else space[0, ]
          } else {
            lma_lspace(parts, space_name)
          }
          if (nrow(part_vectors)) {
            space_terms <- rownames(part_vectors)
            space_dim <- ncol(space)
            names(split_parts) <- obs[phrase]
            agg_vectors <- t(vapply(split_parts, function(p) {
              su <- p %in% space_terms
              if (any(su)) {
                colMeans(part_vectors[p[su], , drop = FALSE])
              } else {
                numeric(space_dim)
              }
            }, numeric(space_dim)))
            space <- rbind(space, agg_vectors[rowSums(agg_vectors) != 0, ])
          }
        }
      }
      if (is.null(space_name)) space_name <- "custom"
      has_space <- TRUE
    }
  }
  if (verbose) cat("preparing results (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
  terms <- cbind(terms, do.call(rbind, lapply(matches, function(m) {
    hits <- if (length(m)) {
      if (collapsed_terms) m[!is.na(m) & m != 0] <- 1L
      if (has_space) {
        msim <- m
        if (length(m) == 1) {
          msim[] <- 1L
        } else {
          msim[] <- NA
          su <- names(m) %in% rownames(space)
          if (sum(su) == 1) {
            msim[su] <- 1L
          } else if (any(su)) {
            ns <- names(m)[su]
            sims <- lma_simets(space[ns, ], metric = "pearson", symmetrical = TRUE)
            msim[su] <- as.numeric(sims[, which.max(colMeans(sims))])
          }
        }
        o <- order(msim, -nchar(names(m)), decreasing = TRUE)
        m <- m[o]
        if (as_string) {
          paste(paste0(names(m), " (", if (!as_terms) paste0(m, ", "), round(msim[o], 2), ")"), collapse = ", ")
        } else {
          list(as.data.frame(rbind(m, msim[o])))
        }
      } else {
        m <- m[order(m, -nchar(names(m)), decreasing = TRUE)]
        if (as_string) {
          paste(paste0(names(m), if (!as_terms) paste0(" (", m, ")")), collapse = ", ")
        } else {
          list(t(as.data.frame(m)))
        }
      }
    } else {
      if (has_space) msim <- NA
      if (as_string) "" else list(data.frame())
    }
    if (!as_string) rownames(hits[[1]]) <- NULL
    res <- if (collapsed_terms) {
      data.frame(
        count = length(m),
        max_count = if (all(is.na(m))) 0L else 1L,
        variants = length(m)
      )
    } else {
      data.frame(
        count = sum(m),
        max_count = if (all(is.na(m))) 0L else max(m, na.rm = TRUE),
        variants = length(m)
      )
    }
    if (has_space) {
      res$space <- space_name
      if (all(is.na(msim))) {
        res$mean_sim <- res$min_sim <- NA
      } else {
        res$mean_sim <- mean(msim, na.rm = TRUE)
        res$min_sim <- min(msim, na.rm = TRUE)
      }
    }
    res[["matches"]] <- hits
    res
  })))
  terms <- terms[if (has_space) order(terms$mean_sim) else order(terms$variants, decreasing = TRUE), ]
  rownames(terms) <- NULL
  if (!is.null(outFile)) {
    outFile <- paste0(sub(".csv", "", outFile, fixed = TRUE), ".csv")
    if (verbose) cat("writing results: ", outFile, " (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
    write.table(terms, outFile, sep = ",", row.names = FALSE, qmethod = "double")
  }
  if (verbose) cat("done (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
  terms
}

#' Download Dictionaries
#'
#' Downloads the specified dictionaries from \href{https://osf.io/y6g5b}{osf.io/y6g5b}.
#'
#' @param dict One or more names of dictionaries to download, or \code{'all'} for all available. See
#'  \href{https://osf.io/y6g5b/wiki/home}{osf.io/y6g5b/wiki} for more information, and a list of available dictionaries.
#' @param check.md5 Logical; if \code{TRUE} (default), retrieves the MD5 checksum from OSF,
#'  and compares it with that calculated from the downloaded file to check its integrity.
#' @param mode A character specifying the file write mode; default is 'wb'. See
#'  \code{\link{download.file}}.
#' @param dir Directory in which to save the dictionary; \cr default is \code{getOption('lingmatch.dict.dir')}. \cr
#'  This must be specified, or the option must be set -- use \code{\link{lma_initdirs}} to initialize a directory.
#' @param overwrite Logical; if \code{TRUE}, will replace existing files.
#' @return Path to the downloaded dictionary, or a list of such if multiple were downloaded.
#' @family Dictionary functions
#' @examples
#' \dontrun{
#'
#' download.dict("lusi", dir = "~/Dictionaries")
#' }
#' @export

download.dict <- function(
    dict = "lusi", check.md5 = TRUE, mode = "wb", dir = getOption("lingmatch.dict.dir"),
    overwrite = FALSE) {
  download.resource(
    "dict", dict,
    check.md5 = check.md5, mode = mode, dir = dir, overwrite = overwrite
  )
}

#' Categorize Texts
#'
#' Categorize raw texts using a pattern-based dictionary.
#'
#' @param text A vector of text to be categorized. Texts are padded by 2 spaces, and potentially lowercased.
#' @param dict At least a vector of terms (patterns), usually a matrix-like object with columns for terms,
#'   categories, and weights.
#' @param pattern.weights A vector of weights corresponding to terms in \code{dict}, or the column name of
#'   weights found in \code{dict}.
#' @param pattern.categories A vector of category names corresponding to terms in \code{dict}, or the column name of
#'   category names found in \code{dict}.
#' @param bias A constant to add to each category after weighting and summing. Can be a vector with names
#'   corresponding to the unique values in \code{dict[, category]}, but is usually extracted from dict based
#'   on the intercept included in each category (defined by \code{name.map['intname']}).
#' @param to.lower Logical indicating whether \code{text} should be converted to lowercase before processing.
#' @param return.dtm Logical; if \code{TRUE}, only a document-term matrix will be returned, rather than the
#'   weighted, summed, and biased category values.
#' @param exclusive Logical; if \code{FALSE}, each dictionary term is searched for in the original text.
#'   Otherwise (by default), terms are sorted by length (with longer terms being searched for first), and
#'   matches are removed from the text (avoiding subsequent matches to matched patterns).
#' @param drop.zeros logical; if \code{TRUE}, categories or terms with no matches will be removed.
#' @param boundary A string to add to the beginning and end of each dictionary term. If \code{TRUE},
#'   \code{boundary} will be set to \code{' '}, avoiding pattern matches within words. By default, dictionary
#'   terms are left as entered.
#' @param fixed Logical; if \code{FALSE}, patterns are treated as regular expressions.
#' @param globtoregex Logical; if \code{TRUE}, initial and terminal asterisks are replaced with \code{\\\\b\\\\w*}
#'   and \code{\\\\w*\\\\b} respectively. This will also set \code{fixed} to \code{FALSE} unless fixed is specified.
#' @param name.map A named character vector:
#'   \itemize{
#'     \item \strong{\code{intname}}: term identifying category biases within the term list;
#'       defaults to \code{'_intercept'}
#'     \item \strong{\code{term}}: name of the column containing terms in \code{dict}; defaults to \code{'term'}
#'   }
#'   Missing names are added, so names can be specified positional (e.g., \code{c('_int',} \code{'terms')}),
#'   or only some can be specified by name (e.g., \code{c(term =} \code{'patterns')}), leaving the rest default.
#' @param dir Path to a folder in which to look for \code{dict} if it is the name of a file to be passed to
#'   \code{\link{read.dic}}.
#' @seealso For applying term-based dictionaries (to a document-term matrix) see \code{\link{lma_termcat}()}.
#' @family Dictionary functions
#' @return A matrix with a row per \code{text} and columns per dictionary category, or (when \code{return.dtm = TRUE})
#' a sparse matrix with a row per \code{text} and column per term. Includes a \code{WC} attribute with original
#' word counts, and a \code{categories} attribute with row indices associated with each category if
#' \code{return.dtm = TRUE}.
#' @examples
#' # example text
#' text <- c(
#'   paste(
#'     "Oh, what youth was! What I had and gave away.",
#'     "What I took and spent and saw. What I lost. And now? Ruin."
#'   ),
#'   paste(
#'     "God, are you so bored?! You just want what's gone from us all?",
#'     "I miss the you that was too. I love that you."
#'   ),
#'   paste(
#'     "Tomorrow! Tomorrow--nay, even tonight--you wait, as I am about to change.",
#'     "Soon I will off to revert. Please wait."
#'   )
#' )
#'
#' # make a document-term matrix with pre-specified terms only
#' lma_patcat(text, c("bored?!", "i lo", ". "), return.dtm = TRUE)
#'
#' # get counts of sets of letter
#' lma_patcat(text, list(c("a", "b", "c"), c("d", "e", "f")))
#'
#' # same thing with regular expressions
#' lma_patcat(text, list("[abc]", "[def]"), fixed = FALSE)
#'
#' # match only words
#' lma_patcat(text, list("i"), boundary = TRUE)
#'
#' # match only words, ignoring punctuation
#' lma_patcat(
#'   text, c("you", "tomorrow", "was"),
#'   fixed = FALSE,
#'   boundary = "\\b", return.dtm = TRUE
#' )
#'
#' \dontrun{
#'
#' # read in the temporal orientation lexicon from the World Well-Being Project
#' tempori <- read.csv(paste0(
#'   "https://raw.githubusercontent.com/wwbp/lexica/master/",
#'   "temporal_orientation/temporal_orientation_lexicon.csv"
#' ))
#'
#' lma_patcat(text, tempori)
#'
#' # or use the standardized version
#' tempori_std <- read.dic("wwbp_prospection", dir = "~/Dictionaries")
#'
#' lma_patcat(text, tempori_std)
#'
#' ## get scores on the same scale by adjusting the standardized values
#' tempori_std[, -1] <- tempori_std[, -1] / 100 *
#'   select.dict("wwbp_prospection")$selected[, "original_max"]
#'
#' lma_patcat(text, tempori_std)[, unique(tempori$category)]
#' }
#' @export

lma_patcat <- function(text, dict = NULL, pattern.weights = "weight", pattern.categories = "category", bias = NULL,
                       to.lower = TRUE, return.dtm = FALSE, drop.zeros = FALSE, exclusive = TRUE, boundary = NULL, fixed = TRUE,
                       globtoregex = FALSE, name.map = c(intname = "_intercept", term = "term"), dir = getOption("lingmatch.dict.dir")) {
  text_names <- names(text)
  if (is.factor(text)) text <- as.character(text)
  if (!is.character(text)) stop("enter a character vector as the first argument")
  text <- paste(" ", text, " ")
  if (is.null(names(name.map)) && length(name.map) < 3) names(name.map) <- c("intname", "term")[seq_along(name.map)]
  wide <- FALSE
  if (missing(dict) && missing(pattern.weights) && missing(pattern.categories)) dict <- lma_dict()
  if (is.character(dict) && length(dict) == 1 && (file.exists(dict) || grepl("^[A-Za-z_]{3}", dict)) &&
    missing(pattern.weights) && missing(pattern.categories)) {
    if (dir == "") dir <- "~/Dictionaries"
    if (!any(file.exists(dict)) && any(file.exists(normalizePath(paste0(dir, "/", dict), "/", FALSE)))) {
      dict <- normalizePath(paste0(dir, "/", dict), "/", FALSE)
    }
    td <- tryCatch(read.dic(dict), error = function(e) NULL)
    dict <- if (is.null(td)) list(cat1 = dict) else td
  }
  if (!is.null(dim(dict))) {
    if (is.null(colnames(dict))) {
      colnames(dict) <- paste0("X", seq_len(ncol(dict)))
    } else {
      if (!is.data.frame(dict)) dict <- as.data.frame(as.matrix(dict), stringsAsFactors = FALSE)
      terms <- if (name.map[["term"]] %in% colnames(dict)) colnames(dict) != name.map[["term"]] else !logical(ncol(dict))
      if (missing(pattern.weights) && !any(pattern.weights %in% colnames(dict))) {
        if (any(su <- terms & vapply(dict, is.numeric, TRUE))) {
          terms <- terms & !su
          pattern.weights <- dict[, su]
        }
      }
      if (missing(pattern.categories) && !pattern.categories %in% colnames(dict)) {
        if (any(su <- terms & vapply(dict, function(v) !is.numeric(v) && anyDuplicated(v), TRUE))) {
          terms <- terms & !su
          pattern.categories <- dict[, su]
          if (sum(su) > 1) pattern.categories <- do.call(paste, pattern.categories)
        }
      }
      if (name.map[["term"]] %in% colnames(dict)) {
        dict[, name.map[["term"]]]
      } else if (!all(terms)) {
        dict <- if (any(terms)) dict[, which(terms)[1]] else rownames(dict)
      }
    }
  }
  # independently entered wide weights
  if ((is.null(dict) || is.null(dim(dict))) && (!is.null(ncol(pattern.weights)) || !is.null(ncol(pattern.categories)))) {
    weights <- if (!is.null(ncol(pattern.weights))) pattern.weights else pattern.categories
    if (!is.null(rownames(weights)) && any(grepl("[^0-9]", rownames(weights)))) {
      dict <- rownames(weights)
    } else if (is.list(dict) && (length(dict) == 1 ||
      (length(dict[[1]]) == nrow(weights) && all(vapply(dict, length, 0) == nrow(weights))))) {
      dict <- dict[[1]]
    }
    if (length(dict) != nrow(weights)) stop("dict and wide weights do not align")
    wide <- TRUE
    if (!missing(pattern.categories) && is.character(pattern.categories) && any(su <- pattern.categories %in% weights)) {
      weights <- weights[, pattern.categories[su], drop = FALSE]
    }
    weights <- weights[, vapply(seq_len(ncol(weights)), function(col) is.numeric(weights[, col]), TRUE), drop = FALSE]
    if (!ncol(weights)) stop("could not identify numeric weights in wide weights")
    lex <- list(terms = dict, weights = weights, category = colnames(weights))
    # wide weights in dict
  } else if (!is.null(dim(dict)) && (
    (length(pattern.weights) > 1 && is.character(pattern.weights)) ||
      (length(pattern.categories) > 1 &&
        (length(pattern.categories) != nrow(dict) || all(pattern.categories %in% colnames(dict)))) ||
      (!any(pattern.weights %in% colnames(dict)) && !any(pattern.categories %in% colnames(dict)))
  )) {
    if (any(su <- pattern.weights %in% colnames(dict))) {
      categories <- pattern.weights[su]
    } else if (any(su <- pattern.categories %in% colnames(dict))) {
      categories <- pattern.categories
    } else if (any(su <- vapply(colnames(dict), function(v) is.numeric(dict[, v]), TRUE))) {
      categories <- colnames(dict)[su]
    } else {
      stop("could not find weights in dict column names")
    }
    wide <- TRUE
    if (!name.map[["term"]] %in% colnames(dict)) {
      terms <- colnames(dict)[vapply(colnames(dict), function(v) !is.numeric(dict[, v]), TRUE)]
      if (!length(terms)) stop("could not find terms in dict")
      name.map[["term"]] <- if (length(terms) > 1) {
        su <- vapply(terms, function(v) !anyDuplicated(dict[, v]), TRUE)
        if (any(su)) terms[which(su)[1]] else terms[1]
      } else {
        terms
      }
    }
    lex <- list(term = dict[, name.map[["term"]]], weights = dict[, categories, drop = FALSE], category = categories)
    # independently entered weights and categories
  } else if (is.null(dim(dict))) {
    if (is.null(dict) || (is.numeric(dict) && is.null(names(dict))) || (is.list(dict) && is.numeric(dict[[1]]) &&
      is.null(names(dict[[1]])))) {
      stop("could not recognize terms in dict")
    }
    n <- length(dict)
    lex <- data.frame(
      term = if (is.character(dict)) {
        dict
      } else if (is.numeric(dict)) {
        names(dict)
      } else if (is.list(dict) &&
        is.numeric(dict[[1]])) {
        unlist(lapply(dict, names), use.names = FALSE)
      } else {
        unlist(dict, use.names = FALSE)
      },
      category = if (length(pattern.categories) == n) {
        if (is.list(dict) && !is.null(names(dict))) {
          names(dict)
        } else {
          pattern.categories
        }
      } else if (is.list(dict)) {
        rep(if (!is.null(names(dict))) {
          names(dict)
        } else {
          paste0("cat", seq_along(dict))
        }, vapply(dict, length, 0))
      } else {
        "cat1"
      },
      weights = if (is.numeric(dict)) {
        unname(dict)
      } else if (is.numeric(pattern.weights)) {
        if (!is.null(names(pattern.weights)) && is.character(dict) && all(dict %in% names(pattern.weights))) {
          pattern.weights[dict]
        } else {
          pattern.weights
        }
      } else if (is.list(dict)) {
        if (is.numeric(dict[[1]])) {
          unlist(dict, use.names = FALSE)
        } else if (is.list(pattern.weights) && is.numeric(pattern.weights[[1]])) {
          unlist(pattern.weights, use.names = FALSE)
        } else {
          1
        }
      } else {
        1
      }, stringsAsFactors = FALSE
    )
  } else {
    term <- if ("term" %in% names(name.map)) name.map[["term"]] else "term"
    en <- colnames(dict)
    if (!term %in% en) {
      su <- vapply(en, function(v) !is.numeric(dict[, v]), TRUE)
      if (any(su)) {
        term <- en[which(su)[1]]
        if (sum(su) > 1) {
          su <- su & vapply(en, function(v) !anyDuplicated(dict[, v]), TRUE)
          if (any(su)) term <- en[which(su)[1]]
        }
      } else {
        stop("could not recognize terms in dict")
      }
    }
    lex <- data.frame(
      term = dict[[term]],
      category = if (length(pattern.categories) == nrow(dict)) {
        pattern.categories
      } else if (pattern.categories %in% en) dict[[pattern.categories]] else "cat1",
      weights = if (length(pattern.weights) == nrow(dict)) {
        pattern.weights
      } else if (all(pattern.weights %in% en)) dict[[pattern.weights]] else 1, stringsAsFactors = FALSE
    )
  }
  if (any(lex$category == "")) lex[lex$category == "", "category"] <- "cat_unnamed"
  if (is.factor(lex$term)) lex$term <- as.character(lex$term)
  if (globtoregex || !fixed) {
    lex$term <- to_regex(list(lex$term), TRUE, globtoregex)[[1]]
    if (missing(fixed)) fixed <- FALSE
  }
  if (wide && return.dtm) {
    wide <- FALSE
    lex <- data.frame(term = lex$term, category = if (length(lex$category) == 1) lex$category else "all")
  }
  if (!return.dtm && is.null(bias)) {
    if (!"intname" %in% names(name.map)) name.map[["intname"]] <- "_intercept"
    if (any(su <- lex$term == name.map[["intname"]])) {
      if (wide) {
        bias <- structure(lex$weights[su, ], names = lex$categories[su])
        lex$term <- lex$term[!su]
        lex$weights <- lex$weights[!su, , drop = FALSE]
      } else {
        bias <- structure(lex[su, "weights"], names = lex[su, "category"])
        lex <- lex[!su, ]
      }
    }
  }
  if (exclusive) {
    cls <- tryCatch(-nchar(lex$term), error = function(e) NULL)
    if (is.null(cls)) {
      warning(
        "dict appears to be misencoded, so results may not be as expected;\n",
        'might try reading the dictionary in with encoding = "latin1"'
      )
      lex$term <- iconv(lex$term, sub = "#")
      cls <- -nchar(lex$term)
    }
    if (wide) {
      o <- order(cls)
      lex$term <- lex$term[o]
      lex$weights <- lex$weights[o, ]
    } else {
      lex <- lex[order(cls), ]
    }
  }
  lex$category <- factor(lex$category, unique(lex$category))
  categories <- levels(lex$category)
  if (length(bias)) {
    if (is.null(names(bias)) && length(bias) == length(categories)) names(bias) <- categories
    if (any(su <- !categories %in% names(bias))) bias[categories[su]] <- 0
  } else {
    bias <- structure(integer(length(categories)), names = categories)
  }
  bias <- bias[categories]
  if (is.logical(boundary) && boundary) boundary <- " "
  if (missing(to.lower)) {
    if (any(grepl("[A-Z]", lex$term))) {
      to.lower <- FALSE
      if (!any(grepl("[a-z]", lex$term))) text <- toupper(text)
    }
  }
  if (to.lower) text <- tolower(text)
  st <- proc.time()[[3]]
  terms <- unique(lex$term)
  if (!fixed) {
    ck <- tryCatch(
      suppressWarnings(grepl(paste0("(?:", paste(terms, collapse = "|"), ")"), "", perl = TRUE)),
      error = function(e) NULL
    )
    if (is.null(ck)) stop("terms contain invalid regular expressions", call. = FALSE)
  }
  op <- pattern_search(
    text, if (is.character(boundary)) paste0(boundary, terms, boundary) else terms,
    seq_along(terms) - 1L, fixed, exclusive
  )
  colnames(op[[1]]) <- terms
  if (return.dtm) {
    attr(op[[1]], "categories") <- lapply(categories, function(cat) {
      which(colnames(op[[1]]) %in% lex[lex$category == cat, "term"])
    })
    names(attr(op[[1]], "categories")) <- categories
  } else {
    op[[1]] <- vapply(categories, function(cat) {
      l <- if (wide) {
        data.frame(term = lex$term, weights = if (cat %in% colnames(lex$weights)) {
          lex$weights[, cat]
        } else {
          lex$weights
        }, stringsAsFactors = FALSE)
      } else {
        lex[lex$category == cat, ]
      }
      as.numeric(op[[1]][, l$term, drop = FALSE] %*% l$weights + bias[[cat]])
    }, numeric(length(text)))
    if (length(text) == 1) {
      op[[1]] <- t(op[[1]])
      rownames(op[[1]]) <- 1
    }
  }
  if (length(text_names) == nrow(op[[1]])) rownames(op[[1]]) <- text_names
  attr(op[[1]], "WC") <- op[[2]]
  attr(op[[1]], "time") <- c(patcat = proc.time()[[3]] - st)
  if (drop.zeros) op[[1]] <- op[[1]][, colSums(op[[1]]) != 0, drop = FALSE]
  op[[1]]
}

.onLoad <- function(lib, pkg) {
  if (is.null(getOption("lingmatch.lspace.dir"))) options(lingmatch.lspace.dir = "")
  if (is.null(getOption("lingmatch.dict.dir"))) options(lingmatch.dict.dir = "")
}

match_metric <- function(x) {
  mets <- c("jaccard", "euclidean", "canberra", "cosine", "pearson")
  sel <- if (is.null(x) || (length(x) == 1 && grepl(tolower(substr(x, 1, 1)), "a", fixed = TRUE))) {
    mets
  } else if (is.function(x)) {
    stop("only internal metrics are available: ", paste(mets, collapse = ", "), call. = FALSE)
  } else {
    if (is.numeric(x)) {
      mets[x]
    } else {
      if (is.call(x)) x <- eval(x)
      su <- grepl("^(?:cor|r)", x, TRUE)
      if (any(su)) x[su] <- "pearson"
      unique(unlist(lapply(substr(x, 1, 3), grep, mets, fixed = TRUE, value = TRUE)))
    }
  }
  list(all = mets, selected = sel, dummy = as.integer(mets %in% sel))
}

to_regex <- function(dict, intext = FALSE, isGlob = TRUE) {
  lapply(dict, function(l) {
    l <- gsub("([+*])[+*]+", "\\\\\\1+", sub("(?<=[^\\\\])\\\\$", "\\\\\\\\", l, perl = TRUE))
    if (isGlob) {
      if (any(ck <- grepl("(?:^|\\s+)\\*|(?<=\\w)\\*(?:$|\\s+)", l, perl = TRUE))) {
        l[ck] <- gsub("\\*", "[^\\\\s]*", gsub("([.^$?(){}[-]|\\])", "\\\\\\1", l[ck], perl = TRUE))
      }
    }
    if (any(ck <- grepl("[[({]", l) + grepl("[})]|\\]", l) == 1)) {
      l[ck] <- gsub("([([{}\\])])", "\\\\\\1", l[ck], perl = TRUE)
    }
    l
  })
}

download.resource <- function(
    type, resource, decompress = TRUE,
    check.md5 = TRUE, mode = "wb", dir = "", overwrite = FALSE) {
  if (dir == "") {
    stop(paste0(
      "specify a directory (dir), or set the ", type,
      " directory option\n(e.g., options(lingmatch.", type, ".dir = ",
      '"~/', if (type == "dict") "Dictionaries" else "Latent Semantic Space",
      '"))\nor initialize it with lma_initdirs()'
    ), call. = FALSE)
  }
  all_resources <- rownames(if (type == "dict") dict_info else lss_info)
  if (length(resource) == 1 && resource == "all") resource <- all_resources
  if (length(resource) > 1) {
    return(lapply(structure(resource, names = resource), function(d) {
      tryCatch(
        download.resource(
          type = type, resource = d, decompress = decompress,
          check.md5 = check.md5, mode = mode, dir = dir
        ),
        error = function(e) e$message
      )
    }))
  }
  dir <- normalizePath(dir, "/", FALSE)
  if (resource == "default") resource <- if (type == "dict") "lusi" else "100k_lsa"
  name <- grep(paste0("^", sub("\\..*$", "", resource)), all_resources, value = TRUE)
  if (!length(name)) {
    name <- grep(
      paste0("^", substr(resource, 1, 4)), all_resources, TRUE,
      value = TRUE
    )
  }
  if (!length(name)) {
    stop(
      type, " ", resource, " not recognized; see https://osf.io/",
      if (type == "dict") "y6g5b" else "489he", "/wiki for available resources"
    )
  } else {
    name <- name[1]
  }
  urls <- list(
    info = function(id) paste0("https://api.osf.io/v2/files/", id),
    dl = function(id) paste0("https://osf.io/download/", id),
    versions = function(id) paste0("https://osf.io/", id, "/?show=revision")
  )
  if (!dir.exists(dir)) dir.create(dir, recursive = TRUE)
  dl <- function(id, ext) {
    s <- urls$dl(id)
    o <- unique(normalizePath(paste0(
      dir, "/", name, c(ext, sub(".bz2", "", ext, fixed = TRUE))
    ), "/", FALSE))
    if (any(file.exists(o))) {
      if (overwrite) {
        unlink(o)
      } else {
        return(-1)
      }
    }
    status <- tryCatch(download.file(s, o[[1]], mode = mode), error = function(e) 1)
    if (!status && check.md5) {
      fi <- strsplit(readLines(urls$info(id), 1, TRUE, FALSE, "utf-8"), '[:,{}"]+')[[1]]
      ck <- md5sum(o[[1]])
      if (fi[which(fi == "md5") + 1] != ck) {
        warning(paste0(
          "MD5 (", ck, ") does not seem to match the one on record;\n",
          "double check and try manually downloading at ", urls$versions(id)
        ))
      }
    }
    if (status) warning("failed to download file from ", s, call. = FALSE)
    status
  }
  if (type == "lspace") {
    status <- dl(lss_info[name, "osf_terms"], "_terms.txt")
    if (status < 1) status <- dl(lss_info[name, "osf_dat"], ".dat.bz2")
    if (status < 1 && decompress) {
      if (Sys.which("bunzip2") == "") {
        warning("could not find path to bunzip2 command for decompression")
      } else {
        o <- normalizePath(paste0(dir, "/", name, ".dat.bz2"), "/", FALSE)
        if (file.exists(o)) {
          status <- tryCatch(system2("bunzip2", shQuote(path.expand(o))), error = function(e) 1)
          if (status) {
            warning(
              'failed to decompress; might try this from a system console:\n  bunzip2 "', path.expand(o), '"'
            )
          }
        }
      }
    }
    paths <- normalizePath(paste0(
      dir, "/", name, c(".dat", if (!decompress) ".bz2", "_terms.txt")
    ), "/", FALSE)
  } else {
    ext <- if (dict_info[name, "weighted"]) ".csv" else ".dic"
    status <- dl(dict_info[name, "osf"], ext)
    paths <- normalizePath(paste0(dir, "/", name, ext), "/", FALSE)
  }
  if (status < 1) {
    message(
      paste0(name, " ", type, " ", if (!status) "downloaded" else "exists", ":\n  "),
      paste(paths, collapse = "\n  ")
    )
  }
  invisible(paths)
}

#' Linguistic Matching and Accommodation
#'
#' Offers a variety of methods to assess linguistic matching or accommodation, where \emph{matching}
#' is general similarity (sometimes called \emph{homophily}), and \emph{accommodation} is some form
#' of conditional similarity (accounting for some base-rate or precedent; sometimes called
#' \emph{alignment}).
#'
#' There are a great many points of decision in the assessment of linguistic similarity and/or
#' accommodation, partly inherited from the great many point of decision inherent in the numerical
#' representation of language. Two general types of matching are implemented here as sets of
#' defaults: Language/Linguistic Style Matching (LSM; Niederhoffer & Pennebaker, 2002; Ireland &
#' Pennebaker, 2010), and Latent Semantic Analysis/Similarity (LSA; Landauer & Dumais, 1997;
#' Babcock, Ta, & Ickes, 2014). See the \code{type} argument for specifics.
#'
#' @param input Texts to be compared; a vector, document-term matrix (dtm; with terms as column names),
#'   or path to a file (.txt or .csv, with texts separated by one or more lines/rows).
#' @param comp Defines the comparison to be made:
#' \itemize{
#'   \item If a \strong{function}, this will be applied to \code{input} within each group (overall if there is
#'     no group; i.e., \code{apply(input, 2, comp)}; e.g., \code{comp = mean} would compare each text to
#'     the mean profile of its group).
#'   \item If a \strong{character} with a length of 1 and no spaces:
#'     \itemize{
#'       \item If it partially matches one of \code{lsm_profiles}'s rownames, that row will be used as the comparison.
#'       \item If it partially matches \code{'auto'}, the highest correlating \code{lsm_profiles} row will be used.
#'       \item If it partially matches \code{'pairwise'}, each text will be compared to one another.
#'       \item If it partially matches \code{'sequential'}, the last variable in \code{group} will be treated as
#'         a speaker ID (see the Grouping and Comparisons section).
#'     }
#'   \item If a \strong{character vector}, this will be processed in the same way as \code{input}.
#'   \item If a \strong{vector}, either (a) logical or factor-like (having n levels < length) and of the same length as
#'     \code{nrow(input)}, or (b) numeric or logical of length less than \code{nrow(input)}, this will be used to
#'     select a subset of \code{input} (e.g., \code{1:10} would treat the first 10 rows of \code{input} as the
#'     comparison; \code{lingmatch(text, type == 'prompt', data)} would use the texts in the \code{text} column
#'     identified by the \code{type} column as the comparison).
#'   \item If a \strong{matrix-like object} (having multiple rows and columns), or a named vector, this will
#'     be treated as a sort of dtm, assuming there are common (column) names between \code{input} and
#'     \code{comp} (e.g., if you had prompt and response texts that were already processed separately).
#' }
#' @param data A matrix-like object as a reference for column names, if variables are referred to in
#'   other arguments (e.g., \code{lingmatch(text, data = data)} would be the same as
#'   \code{lingmatch(data$text)}.
#' @param group A logical or factor-like vector the same length as \code{NROW(input)}, used to defined
#'   groups.
#' @param ... Passes arguments to \code{\link{lma_dtm}}, \code{\link{lma_weight}},
#'   \code{\link{lma_termcat}}, and/or \code{\link{lma_lspace}} (depending on \code{input} and \code{comp}),
#'   and \code{\link{lma_simets}}.
#' @param comp.data A matrix-like object as a source for \code{comp} variables.
#' @param comp.group The column name of the grouping variable(s) in \code{comp.data}; if
#'   \code{group} contains references to column names, and \code{comp.group} is not specified,
#'   \code{group} variables will be looked for in \code{comp.data}.
#' @param order A numeric vector the same length as \code{nrow(input)} indicating the order of the
#'   texts and grouping variables when the type of comparison is sequential. Only necessary if the
#'   texts are not already ordered as desired.
#' @param drop logical; if \code{TRUE}, will drop columns with a sum of 0.
#' @param all.levels logical; if \code{FALSE}, multiple groups are combined. See the Grouping and
#'   Comparisons section.
#' @param type A character at least partially matching 'lsm' or 'lsa'; applies default settings
#'   aligning with the standard calculations of each type:
#'   \tabular{ll}{
#'     LSM \tab \code{lingmatch(text, weight = 'freq', dict = lma_dict(1:9), metric = 'canberra')}\cr
#'     LSA \tab \code{lingmatch(text, weight = 'tfidf', space = '100k_lsa', metric = 'cosine')}\cr
#'   }
#' @section Grouping and Comparisons:
#' Defining groups and comparisons can sometimes be a bit complicated, and requires dataset
#' specific knowledge, so it can't always (readily) be done automatically. Variables entered in the
#' \code{group} argument are treated differently depending on their position and other arguments:
#'
#' \describe{
#'   \item{Splitting}{By default, groups are treated as if they define separate chunks of data in
#'     which comparisons should be calculated. Functions used to calculated comparisons, and
#'     pairwise comparisons are performed separately in each of these groups. For example, if you
#'     wanted to compare each text with the mean of all texts in its condition, a \code{group}
#'     variable could identify and split by condition. Given multiple grouping variables,
#'     calculations will either be done in each split (if \code{all.levels = TRUE}; applied in
#'     sequence so that groups become smaller and smaller), or once after all splits are made (if
#'     \code{all.levels = FALSE}). This makes for 'one to many' comparisons with either calculated
#'     or preexisting standards (i.e., the profile of the current data, or a precalculated profile,
#'     respectively).}
#'   \item{Comparison ID}{When comparison data is identified in \code{comp}, groups are assumed
#'     to apply to both \code{input} and \code{comp} (either both in \code{data}, or separately
#'     between \code{data} and \code{comp.data}, in which case \code{comp.group} may be needed if
#'     the same grouping variable have different names between \code{data} and \code{comp.data}).
#'     In this case, multiple grouping variables are combined into a single factor assumed to
#'     uniquely identify a comparison. This makes for 'one to many' comparisons with specific texts
#'     (as in the case of manipulated prompts or text-based conditions).}
#'   \item{Speaker ID}{If \code{comp} matches \code{'sequential'}, the last grouping variable
#'     entered is assumed to identify something like speakers (i.e., a factor with two or more
#'     levels and multiple observations per level). In this case, the data are assumed to be ordered
#'     (or ordered once sorted by \code{order} if specified). Any additional grouping variables
#'     before the last are treated as splitting groups. This can set up for probabilistic
#'     accommodation metrics. At the moment, when sequential comparisons are made within groups,
#'     similarity scores between speakers are averaged, resulting in mean matching between speakers
#'     within the group.}
#' }
#' @references
#' Babcock, M. J., Ta, V. P., & Ickes, W. (2014). Latent semantic similarity and language style
#'   matching in initial dyadic interactions. \emph{Journal of Language and Social Psychology, 33},
#'   78-88.
#'
#' Ireland, M. E., & Pennebaker, J. W. (2010). Language style matching in writing: synchrony in
#'   essays, correspondence, and poetry. \emph{Journal of Personality and Social Psychology, 99},
#'   549.
#'
#' Landauer, T. K., & Dumais, S. T. (1997). A solution to Plato's problem: The latent semantic
#'   analysis theory of acquisition, induction, and representation of knowledge.
#'   \emph{Psychological Review, 104}, 211.
#'
#' Niederhoffer, K. G., & Pennebaker, J. W. (2002). Linguistic style matching in social interaction.
#'   \emph{Journal of Language and Social Psychology, 21}, 337-360.
#' @seealso For a general text processing function, see \code{\link{lma_process}()}.
#' @return A list with processed components of the input, information about the comparison, and results of
#' the comparison:
#' \itemize{
#'   \item \strong{\code{dtm}}: A sparse matrix; the raw count-dtm, or a version of the original input
#'     if it is more processed.
#'   \item \strong{\code{processed}}: A matrix-like object; a processed version of the input
#'     (e.g., weighted and categorized).
#'   \item \strong{\code{comp.type}}: A string describing the comparison if applicable.
#'   \item \strong{\code{comp}}: A vector or matrix-like object; the comparison data if applicable.
#'   \item \strong{\code{group}}: A string describing the group if applicable.
#'   \item \strong{\code{sim}}: Result of \code{\link{lma_simets}}.
#' }
#' @examples
#' # compare single strings
#' lingmatch("Compare this sentence.", "With this other sentence.")
#'
#' # compare each entry in a character vector with...
#' texts <- c(
#'   "One bit of text as an entry...",
#'   "Maybe multiple sentences in an entry. Maybe essays or posts or a book.",
#'   "Could be lines or a column from a read-in file..."
#' )
#'
#' ## one another
#' lingmatch(texts)
#'
#' ## the first
#' lingmatch(texts, 1)
#'
#' ## the next
#' lingmatch(texts, "seq")
#'
#' ## the set average
#' lingmatch(texts, mean)
#'
#' ## other entries in a group
#' lingmatch(texts, group = c("a", "a", "b"))
#'
#' ## one another, without stop words
#' lingmatch(texts, exclude = "function")
#'
#' ## a standard average (based on function words)
#' lingmatch(texts, "auto", dict = lma_dict(1:9))
#'
#' @export
#' @import methods Matrix
#' @importFrom stats na.omit dpois ppois
#' @importFrom Rcpp sourceCpp
#' @importFrom RcppParallel RcppParallelLibs
#' @useDynLib lingmatch, .registration = TRUE

lingmatch <- function(input = NULL, comp = mean, data = NULL, group = NULL, ..., comp.data = NULL, comp.group = NULL, order = NULL,
                      drop = FALSE, all.levels = FALSE, type = "lsm") {
  inp <- as.list(substitute(...()))
  # setting up a default type if specified
  if (!missing(type) && !is.null(type)) {
    type <- if (grepl("lsm|lang|ling|style|match", type, TRUE)) "lsm" else "lsa"
    ni <- names(inp)
    if (type == "lsm" && !"dict" %in% ni) inp$dict <- lma_dict(1:9)
    if (type != "lsm" && !"space" %in% ni) inp$space <- "100k_lsa"
    if (!"metric" %in% ni) inp$metric <- if (type == "lsm") "canberra" else "cosine"
    if (is.null(attr(input, "type")) || length(attr(input, "type")) == 1) {
      if (type == "lsm" && !"percent" %in% ni) inp$percent <- TRUE
      if (type != "lsm" && !"weight" %in% ni) inp$weight <- "tfidf"
    }
  }
  mets <- c("jaccard", "euclidean", "canberra", "cosine", "pearson")
  inp$metric <- if (!is.null(inp$metric)) match_metric(inp$metric)$selected else "cosine"
  if (!length(inp$metric) || all(inp$metric == "")) inp$metric <- "cosine"
  vs <- c("input", "comp", "group", "order", "data", "comp.data", "comp.group")
  opt <- as.list(match.call(expand.dots = FALSE))[vs]
  names(opt) <- vs
  # organizing options for preprocessing
  dsp <- lapply(c("lma_dtm", "lma_weight", "lma_lspace", "lma_termcat", "lma_simets"), function(f) {
    a <- names(as.list(args(f)))
    a <- a[-c(1, length(a))]
    inp[a[a %in% names(inp)]]
  })
  names(dsp) <- c("p", "w", "m", "c", "s")
  # fetches input from data or environment
  parent_frame <- parent.frame(1)
  gv <- function(a, data = NULL) {
    ta <- a
    if (is.character(a)) {
      if (!is.null(data) && a %in% colnames(data)) {
        return(unlist(data[, a]))
      } else if (length(ta) == 1 || !any(grepl(" ", a, fixed = TRUE))) ta <- parse(text = a)
    }
    ta <- tryCatch(eval(ta, parent_frame), error = function(e) NULL)
    if (!length(ta) || (!is.null(dim(ta)) && !dim(ta)[1])) {
      ta <- tryCatch(eval(a, data, parent_frame), error = function(e) NULL)
      if (!length(ta) || (!is.null(dim(ta)) && !dim(ta)[1])) {
        ta <- tryCatch(eval(a, globalenv()), error = function(e) NULL)
        if (is.null(ta)) {
          ta <- tryCatch(eval(a, data), error = function(e) NULL)
          if (is.null(ta)) {
            p <- 2
            while (is.null(ta) && p < 99) {
              p <- p + 1
              ta <- tryCatch(eval(a, parent.frame(p)), error = function(e) NULL)
            }
          }
          if (is.null(ta)) stop("could not find ", deparse(a), call. = FALSE)
        }
      }
    }
    ta
  }
  gd <- function(a, data = NULL) {
    r <- if (is.character(a) && length(a) == 1 && grepl("\\.(?:csv|txt|tsv|tab)$", a, TRUE)) {
      if (file.exists(a)) {
        r <- if (grepl("txt$", a)) {
          readLines(a, warn = FALSE)
        } else {
          r <- read.table(a, TRUE, if (grepl("csv$", a)) "," else "\t", '"', comment.char = "")
          r[, which(!vapply(r, is.numeric, TRUE))[1]]
        }
        r[r != ""]
      } else {
        stop(a, " does not exist", call. = FALSE)
      }
    } else if (is.character(a)) a else gv(a, data)
    if (is.factor(r)) r <- as.character(r)
    if (is.character(r) && length(r) == 1 && grepl("\\.(?:csv|txt|tsv|tab)$", r, TRUE)) r <- gd(r)
    r
  }
  # weight, categorize, and/or map
  wmc <- function(a) {
    if (!is.null(colnames(a)) || (length(dsp$c) == 0 && length(dsp$m) == 0)) {
      if (length(dsp$w) != 0) a <- do.call(lma_weight, c(list(a), lapply(dsp$w, eval, parent_frame)))
      if (length(dsp$c) != 0) a <- do.call(lma_termcat, c(list(a), lapply(dsp$c, eval, parent_frame)))
      if (length(dsp$m) != 0) a <- do.call(lma_lspace, c(list(a), lapply(dsp$m, eval, parent_frame)))
    }
    a
  }
  # initial data parsing
  # input
  if (missing(input)) {
    if (!is.null(data)) {
      opt$input <- opt$data
      input <- data
    } else {
      input <- file.choose()
      opt$input <- input
    }
  }
  if (is.function(input) || ((is.list(input) || is.numeric(input)) && is.null(dim(input)))) {
    stop(
      "enter a character vector or matrix-like object as input",
      call. = FALSE
    )
  }
  if (missing(data)) {
    data <- input
    if (is.character(input)) input <- gd(opt$input)
  } else {
    input <- if (is.character(input) && all(input %in% colnames(data))) data[, input] else gd(opt$input, data)
  }
  if (!missing(group) && is.data.frame(input)) input <- as.matrix(input[, vapply(input, is.numeric, TRUE)])
  rx <- NROW(input)
  # comp
  comp_missing <- missing(comp)
  if (!comp_missing) {
    comp <- gd(opt$comp, if (missing(comp.data)) if (is.call(opt$comp)) NULL else data else comp.data)
    if (!missing(comp.data) && is.character(comp) && all(comp %in% colnames(comp.data))) comp <- comp.data[, comp]
    if (!missing(data) && is.character(comp) && all(comp %in% colnames(data))) comp <- data[, comp]
    if (is.logical(comp)) comp <- which(comp)
    if (missing(comp.data) && !is.null(colnames(comp))) comp.data <- comp
  } else if (comp_missing && missing(group) && missing(comp.data) && missing(comp.group)) {
    opt$comp <- comp <- "pairwise"
  } else {
    opt$comp <- "mean"
  }
  if (length(opt$comp) > 1) opt$comp <- deparse(opt$comp)
  if (is.factor(input)) input <- as.character(input)
  if (is.factor(comp)) {
    comp <- as.character(comp)
  } else if (is.data.frame(comp)) {
    comp <- comp[, vapply(comp, is.numeric, TRUE)]
  }
  do.wmc <- TRUE
  if ("dict" %in% names(inp) && any(class(input) %in% c("matrix", "data.frame")) &&
    is.null(attr(input, "Type"))) {
    cn <- colnames(input)
    dn <- gv(inp$dict)
    if (is.list(dn)) dn <- names(dn)
    if (any(!(ck <- dn %in% cn))) {
      cat_map <- structure(c(rep(colnames(lsm_profiles), 2), "article", "prep"), names = c(
        colnames(lsm_profiles), "personal_pronouns", "impersonal_pronouns", "articles", "auxiliary_verbs",
        "adverbs", "prepositions", "conjunctions", "negations", "quantifiers", "articles", "preps"
      ))
      cn <- sub("^liwc[ .:_-]+", "", tolower(cn))
      tr <- cn %in% names(cat_map)
      if (any(tr)) colnames(input)[tr] <- cat_map[cn[tr]]
      ck <- dn %in% colnames(input)
    }
    if (sum(ck) / length(ck) > .75) {
      inp$dict <- NULL
      if (any(!ck)) dn <- dn[ck]
      input <- input[, dn]
      do.wmc <- FALSE
      if (!comp_missing && any(class(comp) %in% c("matrix", "data.frame")) && all(dn %in% colnames(comp))) {
        comp <- comp[, dn]
      }
    }
  }
  if (!is.matrix(input) && is.character(input)) {
    # if input looks like text, seeing if other text can be added, then converting to a dtm
    if (is.character(comp) && (length(comp) > 1 || grepl(" ", comp, fixed = TRUE))) {
      input <- c(comp, input)
      comp <- seq_along(comp)
      opt$comp <- "text"
    }
    input <- do.call(lma_dtm, c(list(input), dsp$p))
  }
  if (is.data.frame(comp)) comp <- as.matrix(comp)
  cc <- if (is.numeric(comp) && (!is.null(comp.data) || is.null(dim(comp)))) {
    1
  } else if (is.character(comp)) {
    comp <- tolower(comp)
    2
  } else {
    0
  }
  # group and order
  agc <- c("c", "list", "cbind", "data.frame")
  if (missing(group) && !missing(comp.group)) {
    group <- NULL
    opt$group <- opt$comp.group
  }
  if (!missing(group) && !(is.null(colnames(data)) && rx == length(opt$group) - 1)) {
    group <- if (length(opt$group) > 1 && as.character(opt$group[1]) %in% agc &&
      !grepl("[$[]", as.character(opt$group[1]))) {
      group <- tryCatch(gv(opt$group, data), error = function(e) NULL)
      if (is.character(group) && is.null(dim(group)) && all(group %in% colnames(data))) {
        group <- data[, group]
      }
      if (is.null(group)) lapply(opt$group[-1], gv, data) else group
    } else {
      if (!is.null(colnames(data)) && is.character(opt$group) && length(opt$group) < nrow(data)) {
        if (!all(opt$group %in% colnames(data))) {
          stop("group appears to be column names, but were not found in data", call. = FALSE)
        }
        group <- data[, opt$group]
        if (!is.list(group)) group <- if (is.matrix(group)) as.data.frame(group, stringsAsFactors = FALSE) else list(group)
      } else {
        group <- gv(opt$group, data)
        if (is.factor(group)) {
          group <- as.character(group)
        } else if (is.matrix(group)) {
          group <- as.data.frame(group, row.names = FALSE, stringsAsFactors = FALSE)
        }
        if (is.null(dim(group))) list(group) else lapply(group, as.character)
      }
    }
  }
  if (!missing(comp.group) || (!is.null(comp.data) && !missing(group))) {
    cg <- opt[[if (missing(comp.group)) "group" else "comp.group"]]
    if (!is.null(cg)) {
      cg <- if (!is.null(comp.data) && length(cg) > 1 &&
        as.character(cg[1]) %in% agc && !grepl("[$[]", as.character(cg[1]))) {
        cg <- tryCatch(gv(cg, comp.data), error = function(e) NULL)
        if (is.character(cg) && all(cg %in% colnames(comp.data))) cg <- comp.data[, cg]
        if (is.null(cg)) lapply(as.character(cg[-1]), gv, comp.data) else cg
      } else if (is.character(cg)) {
        if (cg %in% colnames(comp.data)) {
          list(comp.data[, cg])
        } else {
          stop("groups not found in comp.data", call. = FALSE)
        }
      } else {
        list(gv(cg, comp.data))
      }
      if (is.list(cg) && length(cg) == 1 && !is.null(dim(cg[[1]]))) {
        cg <- as.data.frame(cg[[1]], stringsAsFactors = FALSE)
      } else if (is.character(cg) && !missing(comp.group) && all(cg %in% colnames(comp.data))) {
        cg <- comp.data[, cg]
      }
      if (!missing(comp.group) || length(if (is.list(cg)) cg[[1]] else cg) == nrow(comp.data)) {
        if (all.levels) {
          comp.group <- cg
        } else {
          comp.group <- do.call(paste, cg)
          if (length(group) > 1) {
            group <- do.call(paste, group)
            if (!is.null(comp.data) && any(ck <- !(ckg <- unique(group)) %in% unique(comp.group))) {
              if (all(ck)) {
                stop("group and comp.group had no levels in common", call. = FALSE)
              } else {
                warning("levels not found in comp.group: ", paste(ckg[ck], collapse = ", "), call. = FALSE)
                group <- group[ck <- group %in% ckg[!ck]]
                input <- input[ck, , drop = FALSE]
              }
            }
          }
        }
      }
    }
  }
  if (!missing(group)) {
    if (is.matrix(group)) group <- as.data.frame(group)
    if (length(if (is.list(group)) group[[1]] else group) != rx) {
      stop("length(group) != nrow(input)", call. = FALSE)
    }
  }
  if (!missing(order)) {
    order <- gv(opt$order, data)
    if (!is.null(order)) {
      if (length(order) == rx) {
        input <- input[order, ]
        group <- lapply(group, "[", order)
      } else {
        warning("length(order) != nrow(input), so order was not applied", call. = FALSE)
      }
    } else {
      warning("failed to apply order", call. = FALSE)
    }
  }
  if (is.character(input)) input <- matrix(as.numeric(input), rx)
  if (is.data.frame(input) && any(ckvc <- !vapply(input, is.numeric, TRUE))) {
    if (all(ckvc)) {
      for (col in seq_along(ckvc)) input[, col] <- as.numeric(input[, col])
    } else {
      input <- input[, !ckvc]
      warning("some input variables were not numeric, so they were removed")
    }
  }
  dtm <- Matrix(if (is.data.frame(input)) as.matrix(input) else input, sparse = TRUE)
  if (do.wmc) input <- wmc(input)
  if (is.null(dim(input))) {
    input <- as.matrix(input)
  }
  if (cc == 2 && (length(comp) > 1 || any(grepl(" ", comp, fixed = TRUE)))) {
    comp <- do.call(lma_dtm, c(list(comp), dsp$p))
    cc <- 1
  }
  # if comp appears to be a dtm, unifying input and comp
  if (cc == 1 && !is.null(names(comp))) comp <- t(as.matrix(comp))
  cr <- nrow(comp)
  cn <- colnames(comp)
  if (!is.null(cn)) {
    cc <- 0
    comp.data <- comp
  }
  if (drop) {
    if (sum(su <- colSums(input, na.rm = TRUE) != 0) != 0) {
      input <- input[, su, drop = FALSE]
    } else {
      stop("input is all 0s after processing", call. = FALSE)
    }
  }
  nc <- ncol(input)
  # finalizing comp
  if (is.numeric(comp) && (cc == 1 || opt$comp == "text")) {
    comp.data <- input[comp, , drop = FALSE]
    if (!missing(comp.group) && !all.levels) {
      if (!anyDuplicated(comp.group) && nrow(comp.data) == length(comp.group)) {
        rownames(comp.data) <- comp.group
      }
    } else if (nrow(comp.data) == 1) {
      comp.data <- structure(as.numeric(comp.data[1, ]),
        names = colnames(comp.data)
      )
    }
    input <- input[-comp, , drop = FALSE]
  } else if (cc == 2) {
    ckp <- FALSE
    if (grepl("^pa|^se", comp)) {
      opt$comp <- if (grepl("^pa", comp)) "pairwise" else "sequential"
    } else if (any(!is.na(p <- pmatch(comp, rownames(lsm_profiles))))) {
      opt$comp <- rownames(lsm_profiles)[p]
      ckp <- TRUE
      comp.data <- lsm_profiles[p, , drop = FALSE]
    } else if (grepl("^au", comp)) {
      p <- colMeans(input, na.rm = TRUE)
      p <- which.max(lma_simets(lsm_profiles, p, "pearson"))
      opt$comp <- paste("auto:", names(p))
      ckp <- TRUE
      comp.data <- lsm_profiles[p, , drop = FALSE]
    } else if (!comp_missing) {
      stop("`comp` not recognized", call. = FALSE)
    }
    if (ckp) {
      if (any(ckp <- !(cn <- colnames(input)) %in% (bn <- colnames(comp.data)))) {
        if (all(ckp)) stop("input and comp have no columns in common", call. = FALSE)
        if ("articles" %in% cn && !"articles" %in% bn) bn[bn == "article"] <- "articles"
        if ("preps" %in% cn && !"preps" %in% bn) bn[bn == "prep"] <- "preps"
        colnames(comp.data) <- bn
        if (any(ckp <- !cn %in% bn)) {
          warning("input columns were not found in comp: ", paste(cn[ckp], collapse = ", "), call. = FALSE)
          comp.data <- comp.data[, cn[!ckp], drop = FALSE]
        }
      } else {
        comp.data <- comp.data[, cn, drop = FALSE]
      }
    }
  } else if (!is.null(comp.data)) {
    cn <- colnames(input)
    cns <- cn[ck <- cn %in% colnames(comp.data)]
    if (!any(ck)) {
      stop("input and comp have no columns in common", call. = FALSE)
    } else if (any(!ck)) {
      warning("input columns were not found in comp: ", paste(cn[!ck], collapse = ", "), call. = FALSE)
      input <- input[, cns]
    }
    comp.data <- comp.data[, cns, drop = FALSE]
  }
  compmeanck <- opt$comp == "mean"
  sim <- speaker <- NULL
  if (!is.null(group)) {
    if (!is.null(comp.data) && (NROW(comp.data) == 1 || (is.list(group) && length(group[[1]]) != nrow(input)))) {
      group <- NULL
      warning("group does not appear to be meaningful for this comparison, so it was ignored",
        call. = FALSE
      )
    } else if (!is.list(group)) group <- list(group)
    gl <- length(group)
    if (opt$comp == "sequential") {
      speaker <- group[[gl]]
      group <- if (gl == 1) NULL else group[-gl]
      gl <- length(group)
    }
    if (gl > 1 && !all.levels) {
      group <- list(do.call(paste, group))
      gl <- 1
    }
    if (gl) {
      sim <- as.data.frame(group, stringsAsFactors = FALSE)
      colnames(sim) <- paste0("g", seq_len(gl))
      for (m in inp$metric) sim[, m] <- NA
      mets <- seq_along(inp$metric) + gl
    }
  } else if (opt$comp == "sequential" && is.null(speaker)) speaker <- seq_len(nrow(input))
  # making comparisons
  sal <- dsp$s
  ck_grouppair <- !(!is.null(group) && if (is.null(comp.group)) {
    !is.null(rownames(comp.data))
  } else {
    !anyDuplicated(comp.group)
  })
  if (ck_grouppair && !is.logical(sal$mean)) {
    sal$mean <- isTRUE(grepl("T", sal$mean, fixed = TRUE))
  }
  ckf <- is.function(comp)
  apply_comp <- function(m) {
    a <- names(as.list(args(comp)))
    if ("na.rm" %in% a) {
      apply(m, 2, comp, na.rm = TRUE)
    } else if ("na.action" %in% a) {
      apply(m, 2, comp, na.action = na.omit)
    } else {
      apply(m, 2, comp)
    }
  }
  if (is.null(group)) {
    if (!is.null(speaker)) sal$group <- speaker
    if (!is.null(comp.data)) {
      if (ckf) {
        opt$comp <- paste(if (length(opt$comp.data) > 1) deparse(opt$comp.data) else opt$comp.data, opt$comp)
        sal$b <- comp.data <- if (is.null(dim(comp.data))) {
          comp.data
        } else if (compmeanck) colMeans(comp.data, na.rm = TRUE) else apply_comp(comp.data)
      } else {
        sal$b <- comp.data
      }
    } else if (ckf) {
      sal$b <- comp.data <- if (compmeanck) {
        colMeans(input, na.rm = TRUE)
      } else {
        apply_comp(input)
      }
    }
    if (!"b" %in% names(sal) && (is.numeric(comp) || !is.null(dim(comp)))) sal$b <- comp
    sim <- do.call(lma_simets, c(list(input), sal))
  } else {
    gs <- as.character(unique(sim[, 1]))
    cks <- !is.null(speaker)
    ckc <- !is.null(comp.data)
    ckp <- cc == 2 && opt$comp == "pairwise"
    if (gl == 1) {
      if (opt$comp != "pairwise") {
        if (opt$comp == "sequential") {
          group <- sim[, 1]
          sim <- do.call(rbind, lapply(gs, function(g) {
            su <- which(group == g)
            s <- speaker[su]
            r <- if (length(su) < 2 || length(unique(s)) < 2) {
              data.frame(group = g, structure(as.list(numeric(length(mets)) + 1),
                names = inp$metric
              ), row.names = paste(su, collapse = ", "), stringsAsFactors = FALSE)
            } else {
              sal$group <- s
              r <- do.call(lma_simets, c(list(input[su, , drop = FALSE]), sal))
              rs <- as.integer(unlist(strsplit(rownames(r), "[^0-9]+")))
              rownames(r) <- strsplit(do.call(sprintf, c(
                paste(gsub("[0-9]+", "%i", rownames(r)), collapse = "|"), as.list(rs - 1 + su[1])
              )), "|", fixed = TRUE)[[1]]
              data.frame(group = g, r, stringsAsFactors = FALSE)
            }
          }))
        } else {
          if (is.null(sal$pairwise)) sal$pairwise <- ck_grouppair
          flat <- ckf || !isTRUE(sal$pairwise) || isTRUE(sal$mean)
          sal$return.list <- !flat
          if (!flat) fsim <- list()
          ckmc <- FALSE
          if (!ckc && ckf) {
            ckmc <- TRUE
            opt$comp <- paste0(if (length(opt$group) == 1) paste(opt$group, ""), "group ", opt$comp)
            comp.data <- as.data.frame(
              matrix(NA, length(gs), nc, dimnames = list(gs, colnames(input))),
              stringsAsFactors = FALSE
            )
          }
          for (g in gs) {
            su <- sim[, 1] == g
            sal$b <- NULL
            if (ckc) {
              sal$b <- comp.data[if (!is.null(comp.group)) comp.group == g else g, , drop = FALSE]
            } else {
              sal$b <- input[su, , drop = FALSE]
            }
            if (ckf && !is.null(dim(sal$b))) {
              sal$b <- if (compmeanck) colMeans(sal$b, na.rm = TRUE) else apply_comp(sal$b)
            }
            if (!is.null(sal$b) && ckmc) comp.data[g, ] <- sal$b
            if (sum(su) == 1 && is.null(sal$b)) {
              sim[su, mets] <- 1
              next
            }
            tm <- do.call(lma_simets, c(list(input[su, , drop = FALSE]), sal))
            if (flat) {
              sim[su, mets] <- tm
            } else {
              fsim[[g]] <- tm
            }
          }
          if (!flat) sim <- fsim
        }
      } else {
        ug <- unique(group[[1]])
        if (isTRUE(sal$mean)) {
          sim <- data.frame(group[[1]], NA, stringsAsFactors = FALSE)
          colnames(sim) <- c(opt$group, sal$metric)
          for (g in ug) {
            su <- group[[1]] == g
            sim[su, -1] <- if (sum(su) == 1) {
              1
            } else {
              do.call(
                lma_simets, c(list(input[su, , drop = FALSE]), sal)
              )
            }
          }
        } else {
          sim <- lapply(structure(ug, names = ug), function(g) {
            su <- group[[1]] == g
            if (sum(su) != 1) {
              do.call(lma_simets, c(list(input[su, , drop = FALSE]), sal))
            } else {
              rep(NA, length(sal$metric))
            }
          })
        }
      }
    } else if (gl > 1) {
      for (i in seq_len(gl - 1)) sim <- cbind(sim, sim[, mets])
      sug <- seq_len(gl)
      cn <- paste0("g", sug)
      mn <- length(inp$metric)
      mw <- seq_len(mn)
      colnames(sim)[-sug] <- paste0(rep(vapply(seq_along(cn), function(e) {
        paste0(cn[seq_len(e)], collapse = "_")
      }, ""), each = mn), "_", inp$metric)
      group <- vapply(sug, function(g) do.call(paste, group[seq_len(g)]), character(nrow(sim)))
      if (!missing(comp.group)) {
        comp.group <- vapply(sug, function(g) {
          do.call(paste, comp.group[seq_len(g)])
        }, character(length(comp.group[[1]])))
      }
      if (is.null(sal$pairwise)) sal$pairwise <- ck_grouppair
      flat <- ckf || isTRUE(sal$mean)
      if (!flat) fsim <- list()
      ssl <- if (is.null(speaker)) TRUE else !is.na(speaker)
      for (g in unique(sim[, 1])) {
        if (!flat && is.null(fsim[[g]])) fsim[[g]] <- list()
        su <- which(sim[, 1] == g & ssl)
        sg <- group[su, , drop = FALSE]
        sx <- input[su, , drop = FALSE]
        gck <- ckc && !missing(comp.group)
        if (gck) {
          gcsub <- comp.group[, 1] == g
          if (!any(gcsub)) {
            warning("the first comparison group has no levels in common with the first data group",
              call. = FALSE
            )
            gck <- FALSE
          }
        }
        for (s in sug) {
          usg <- unique(sg[, s])
          if (length(usg) == 1) {
            ssg <- list(sx)
            names(ssg) <- usg
          } else {
            ssg <- lapply(usg, function(ss) sx[sg[, s] == ss, , drop = FALSE])
            names(ssg) <- usg
          }
          if (length(ssg) != 0) {
            for (ssn in names(ssg)) {
              ssu <- su[sg[, s] == ssn]
              if (!flat && is.null(fsim[[g]][[ssn]])) fsim[[g]][[ssn]] <- list()
              if (cks) {
                sal$group <- speaker[ssu]
              } else if (ckf && !is.null(dim(ssg[[ssn]]))) {
                sal$b <- if (compmeanck) {
                  colMeans(ssg[[ssn]], na.rm = TRUE)
                } else {
                  apply_comp(ssg[[ssn]])
                }
              }
              csu <- gl + mw + (mn * (s - 1))
              if (gck) {
                gcsu <- comp.group[, s] == ssn & gcsub
                if (!any(gcsu)) {
                  warning(
                    "no ", paste(usg, collapse = ", "), " level found in the comparison group(s)"
                  )
                } else {
                  sal$b <- comp.data[gcsu, , drop = FALSE]
                }
              }
              ssim <- do.call(lma_simets, c(list(ssg[[ssn]]), sal))
              if (flat) {
                sim[ssu, csu] <- ssim
              } else {
                fsim[[g]][[ssn]][[colnames(sim)[csu]]] <- ssim
              }
            }
          }
        }
      }
      if (!flat) sim <- fsim
    }
  }
  list(
    dtm = dtm,
    processed = input,
    comp.type = if (!is.null(opt$comp)) {
      if (is.character(opt$comp)) {
        opt$comp
      } else {
        gsub('"', "'", as.character(deparse(opt$comp)))
      }
    },
    comp = comp.data,
    group = if (!is.null(opt$group)) {
      if (is.character(opt$group)) {
        opt$group
      } else {
        gsub('"', "'", as.character(deparse(opt$group)))
      }
    },
    sim = sim
  )
}

#' Process Text
#'
#' A wrapper to other pre-processing functions, potentially from \code{\link{read.segments}}, to \code{\link{lma_dtm}}
#' or \code{\link{lma_patcat}}, to \code{\link{lma_weight}}, then \code{\link{lma_termcat}} or \code{\link{lma_lspace}},
#' and optionally including \code{\link{lma_meta}} output.
#'
#' @param input A vector of text, or path to a text file or folder.
#' @param ... arguments to be passed to \code{\link{lma_dtm}}, \code{\link{lma_patcat}}, \code{\link{lma_weight}},
#'   \code{\link{lma_termcat}}, and/or \code{\link{lma_lspace}}. All arguments must be named.
#' @param meta Logical; if \code{FALSE}, metastatistics are not included. Only applies when raw text is available.
#'  If included, meta categories are added as the last columns, with names starting with "meta_".
#' @param coverage Logical; if \code{TRUE} and a dictionary is provided (\code{dict}),
#'  will calculate the coverage (number of unique term matches) of each dictionary category.
#' @return A matrix with texts represented by rows, and features in columns, unless there are multiple rows per output
#'  (e.g., when a latent semantic space is applied without terms being mapped) in which case only the special output
#'  is returned (e.g., a matrix with terms as rows and latent dimensions in columns).
#' @seealso If you just want to compare texts, see the \code{\link{lingmatch}()} function.
#' @examples
#' # starting with some texts in a vector
#' texts <- c(
#'   "Firstly, I would like to say, and with all due respect...",
#'   "Please, proceed. I hope you feel you can speak freely...",
#'   "Oh, of course, I just hope to be clear, and not cause offense...",
#'   "Oh, no, don't monitor yourself on my account..."
#' )
#'
#' # by default, term counts and metastatistics are returned
#' lma_process(texts)
#'
#' # add dictionary and percent arguments for standard dictionary-based results
#' lma_process(texts, dict = lma_dict(), percent = TRUE)
#'
#' # add space and weight arguments for standard word-centroid vectors
#' lma_process(texts, space = lma_lspace(texts), weight = "tfidf")
#' @export

lma_process <- function(input = NULL, ..., meta = TRUE, coverage = FALSE) {
  inp <- as.list(substitute(...()))
  funs <- c("read.segments", "lma_dtm", "lma_weight", "lma_lspace", "lma_termcat", "lma_patcat")
  allargs <- NULL
  arg_matches <- list()
  for (f in funs) {
    a <- names(as.list(args(f)))
    a <- a[c(FALSE, a[-1] %in% names(inp))]
    allargs <- c(allargs, a)
    arg_matches[[f]] <- inp[a]
  }
  dupargs <- unique(allargs[duplicated(allargs)])
  arg_checks <- vapply(arg_matches, function(l) sum(!names(l) %in% dupargs), 0)
  # identify input
  op <- NULL
  if (is.function(input)) stop("enter a character vector or matrix-like object as input")
  if (is.null(dim(input)) && is.list(input) && is.character(input[[1]])) input <- unlist(input, use.names = FALSE)
  if (is.character(input) || is.factor(input)) {
    ck_paths <- length(input) != 1 && all(file.exists(input))
    op <- if (length(arg_matches$read.segments) || ck_paths) {
      an <- names(arg_matches$read.segments)
      if (!any(grepl("path|text", an))) arg_matches$read.segments$path <- input
      do.call(read.segments, lapply(arg_matches$read.segments, eval.parent, 2))
    } else {
      data.frame(
        text = if (length(input) == 1 && file.exists(input)) readLines(input) else input, stringsAsFactors = FALSE
      )
    }
  } else {
    if (is.null(dim(input))) {
      if (is.null(names(input)) || (is.list(input) && !all(vapply(input, length, 0) == length(input[[1]])))) {
        stop("input is not of a recognized format -- should be text or a dtm-like object")
      }
      input <- if (is.list(input)) as.data.frame(input, stringsAsFactors = FALSE) else t(input)
    }
    op <- input
  }
  # process
  ck_text <- "text" %in% colnames(op) && is.character(op[, "text"])
  ck_changed <- FALSE
  if (ck_text) {
    if (arg_checks[["lma_patcat"]]) {
      if (!"return.dtm" %in% names(arg_matches$lma_patcat) && (coverage || length(arg_matches$lma_weight))) {
        arg_matches$lma_patcat$return.dtm <- TRUE
      }
      arg_matches$lma_patcat$text <- op[, "text"]
      x <- do.call(lma_patcat, lapply(arg_matches$lma_patcat, eval.parent, 2))
      ck_changed <- TRUE
    } else {
      arg_matches$lma_dtm$text <- op[, "text"]
      x <- do.call(lma_dtm, lapply(arg_matches$lma_dtm, eval.parent, 2))
      ck_changed <- TRUE
    }
  } else {
    x <- op
  }
  if (arg_checks[["lma_weight"]]) {
    arg_matches$lma_weight$dtm <- x
    x <- do.call(lma_weight, lapply(arg_matches$lma_weight, eval.parent, 2))
    attr(x, "categories") <- attr(arg_matches$lma_weight$dtm, "categories")
    ck_changed <- TRUE
  }
  if (!is.null(attr(x, "categories"))) {
    categories <- attr(x, "categories")
    xc <- as.data.frame(
      matrix(0, nrow(op), length(categories), dimnames = list(NULL, names(categories))),
      stringsAsFactors = FALSE
    )
    if (coverage) {
      cxc <- xc
      for (cat in names(categories)) {
        su <- x[, categories[[cat]], drop = FALSE]
        xc[, cat] <- rowSums(su, na.rm = TRUE)
        cxc[, cat] <- rowSums(su != 0, na.rm = TRUE)
      }
      colnames(cxc) <- paste0("coverage_", colnames(xc))
      xc <- cbind(xc, cxc)
    } else {
      for (cat in names(categories)) {
        xc[, cat] <- rowSums(x[, categories[[cat]], drop = FALSE], na.rm = TRUE)
      }
    }
    x <- xc
    ck_changed <- TRUE
  } else if (arg_checks[["lma_termcat"]] || (length(arg_matches$lma_termcat) &&
    any(names(arg_matches$lma_termcat) != "dir"))) {
    arg_matches$lma_termcat$coverage <- coverage
    arg_matches$lma_termcat$dtm <- x
    x <- do.call(lma_termcat, lapply(arg_matches$lma_termcat, eval.parent, 2))
    ck_changed <- TRUE
  }
  if (arg_checks[["lma_lspace"]]) {
    nr <- NROW(x)
    arg_matches$lma_lspace$dtm <- x
    x <- do.call(lma_lspace, lapply(arg_matches$lma_lspace, eval.parent, 2))
    if (nrow(x) != nr) {
      colnames(x) <- paste0("dim", seq_len(ncol(x)))
      return(x)
    }
    ck_changed <- TRUE
  }
  if (any(grepl("Matrix", class(x), fixed = TRUE))) x <- as.matrix(x)
  if (is.matrix(x)) x <- as.data.frame(x, stringsAsFactors = FALSE)
  op <- if (ck_text && ck_changed) cbind(op, if (is.null(dim(x))) t(x) else x) else x
  if (ck_text && meta) {
    opm <- lma_meta(op[, "text"])
    if (arg_checks[["lma_weight"]] &&
      (!"normalize" %in% names(arg_matches$lma_weight) || arg_matches$lma_weight$normalize)) {
      cols <- c(9, 14:23)
      opm_counts <- opm[, cols]
      su <- opm_counts != 0
      adj <- if ("percent" %in% names(arg_matches$lma_weight) && arg_matches$lma_weight$percent) 100 else 1
      opm_counts[su] <- opm_counts[su] / rep(opm$words, length(cols))[which(su)] * adj
      opm[, cols] <- opm_counts
    }
    colnames(opm) <- paste0("meta_", colnames(opm))
    op <- cbind(op, opm)
  }
  op
}

#' English Function Word Category and Special Character Lists
#'
#' Returns a list of function words based on the Linguistic Inquiry and Word Count 2015 dictionary
#' (in terms of category names -- words were selected independently), or a list of special characters and patterns.
#' @param ... Numbers or letters corresponding to category names: ppron, ipron, article,
#' adverb, conj, prep, auxverb, negate, quant, interrog, number, interjection, or special.
#' @param as.regex Logical: if \code{FALSE}, lists are returned without regular expression.
#' @param as.function Logical or a function: if specified and \code{as.regex} is \code{TRUE}, the selected dictionary
#' will be collapsed to a regex string (terms separated by \code{|}), and a function for matching characters to that
#' string will be returned. The regex string is passed to the matching function (\code{\link{grepl}} by default)
#' as a 'pattern' argument, with the first argument of the returned function being passed as an 'x' argument.
#' See examples.
#' @note
#' The \code{special} category is not returned unless specifically requested. It is a list of regular expression
#' strings attempting to capture special things like ellipses and emojis, or sets of special characters (those outside
#' of the Basic Latin range; \code{[^\\u0020-\\u007F]}), which can be used for character conversions.
#' If \code{special} is part of the returned list, \code{as.regex} is set to \code{TRUE}.
#'
#' The \code{special} list is always used by both \code{\link{lma_dtm}} and \code{\link{lma_termcat}}. When creating a
#' dtm, \code{special} is used to clean the original input (so that, by default, the punctuation involved in ellipses
#' and emojis are treated as different -- as ellipses and emojis rather than as periods and parens and colons and such).
#' When categorizing a dtm, the input dictionary is passed by the special lists to be sure the terms in the dtm match up
#' with the dictionary (so, for example, ": (" would be replaced with "repfrown" in both the text and dictionary).
#' @seealso To score texts with these categories, use \code{\link{lma_termcat}()}.
#' @return A list with a vector of terms for each category, or (when \code{as.function = TRUE}) a function which
#' accepts an initial "terms" argument (a character vector), and any additional arguments determined by function
#' entered as \code{as.function} (\code{\link{grepl}} by default).
#' @examples
#' # return the full dictionary (excluding special)
#' lma_dict()
#'
#' # return the standard 7 category lsm categories
#' lma_dict(1:7)
#'
#' # return just a few categories without regular expression
#' lma_dict(neg, ppron, aux, as.regex = FALSE)
#'
#' # return special specifically
#' lma_dict(special)
#'
#' # returning a function
#' is.ppron <- lma_dict(ppron, as.function = TRUE)
#' is.ppron(c("i", "am", "you", "were"))
#'
#' in.lsmcat <- lma_dict(1:7, as.function = TRUE)
#' in.lsmcat(c("a", "frog", "for", "me"))
#'
#' ## use as a stopword filter
#' is.stopword <- lma_dict(as.function = TRUE)
#' dtm <- lma_dtm("Most of these words might not be all that relevant.")
#' dtm[, !is.stopword(colnames(dtm))]
#'
#' ## use to replace special characters
#' clean <- lma_dict(special, as.function = gsub)
#' clean(c(
#'   "\u201Ccurly quotes\u201D", "na\u00EFve", "typographer\u2019s apostrophe",
#'   "en\u2013dash", "em\u2014dash"
#' ))
#' @export

lma_dict <- function(..., as.regex = TRUE, as.function = FALSE) {
  cats <- as.character(substitute(...()))
  dict <- list(
    ppron = c(
      "^dae$", "^dem$", "^eir$", "^eirself$", "^em$", "^he$", "^he'", "^her$", "^hers$", "^herself$", "^hes$",
      "^him$", "^himself$", "^hir$", "^hirs$", "^hirself$", "^his$", "^hisself$", "^i$", "^i'", "^id$", "^idc$",
      "^idgaf$", "^idk$", "^idontknow$", "^idve$", "^iirc$", "^iknow$", "^ikr$", "^ill$", "^ily$", "^im$", "^ima$",
      "^imean$", "^imma$", "^ive$", "^lets$", "^let's$", "^me$", "^methinks$", "^mine$", "^my$", "^myself$", "^omfg$",
      "^omg$", "^oneself$", "^our$", "^ours", "^she$", "^she'", "^shes$", "^thee$", "^their$", "^their'", "^theirs",
      "^them$", "^thems", "^they$", "^they'", "^theyd$", "^theyll$", "^theyve$", "^thine$", "^thou$", "^thoust$",
      "^thy$", "^thyself$", "^u$", "^u'", "^ud$", "^ull$", "^ur$", "^ure$", "^us$", "^we$", "^we'", "^weve$", "^y'",
      "^ya'", "^yall", "^yins$", "^yinz$", "^you$", "^you'", "^youd$", "^youll$", "^your$", "^youre$", "^yours$",
      "^yourself$", "^yourselves$", "^youve$", "^zer$", "^zir$", "^zirs$", "^zirself$", "^zis$"
    ),
    ipron = c(
      "^another$", "^anybo", "^anyone", "^anything", "^dat$", "^de+z$", "^dis$", "^everyb", "^everyone",
      "^everything", "^few$", "^it$", "^it'$", "^it'", "^itd$", "^itll$", "^its$", "^itself$", "^many$", "^nobod",
      "^nothing$", "^other$", "^others$", "^same$", "^somebo", "^somebody'", "^someone", "^something", "^stuff$",
      "^that$", "^that'", "^thatd$", "^thatll$", "^thats$", "^these$", "^these'", "^thesed$", "^thesell$", "^thesere$",
      "^thing", "^this$", "^this'", "^thisd$", "^thisll$", "^those$", "^those'", "^thosed$", "^thosell$", "^thosere$",
      "^what$", "^what'", "^whatd$", "^whatever$", "^whatll$", "^whats$", "^which", "^who$", "^who'", "^whod$",
      "^whoever$", "^wholl$", "^whom$", "^whomever$", "^whos$", "^whose$", "^whosever$", "^whosoever$"
    ),
    article = c("^a$", "^an$", "^da$", "^teh$", "^the$"),
    adverb = c(
      "^absolutely$", "^actively$", "^actually$", "^afk$", "^again$", "^ago$", "^ahead$", "^almost$",
      "^already$", "^altogether$", "^always$", "^angrily$", "^anxiously$", "^any$", "^anymore$", "^anyway$",
      "^anywhere$", "^apparently$", "^automatically$", "^away$", "^awhile$", "^back$", "^badly$", "^barely$",
      "^basically$", "^below$", "^brietermsy$", "^carefully$", "^causiously$", "^certainly$", "^clearly$", "^closely$",
      "^coldly$", "^commonly$", "^completely$", "^constantly$", "^continually$", "^correctly$", "^coz$", "^currently$",
      "^daily$", "^deeply$", "^definitely$", "^definitly$", "^deliberately$", "^desperately$", "^differently$",
      "^directly$", "^early$", "^easily$", "^effectively$", "^elsewhere$", "^enough$", "^entirely$", "^equally$",
      "^especially$", "^essentially$", "^etc$", "^even$", "^eventually$", "^ever$", "^every$", "^everyday$",
      "^everywhere", "^exactly$", "^exclusively$", "^extremely$", "^fairly$", "^far$", "^finally$", "^fortunately$",
      "^frequently$", "^fully$", "^further$", "^generally$", "^gently$", "^genuinely$", "^good$", "^greatly$",
      "^hardly$", "^heavily$", "^hence$", "^henceforth$", "^hereafter$", "^herein$", "^heretofore$", "^hesitantly$",
      "^highly$", "^hither$", "^hopefully$", "^hotly$", "^however$", "^immediately$", "^importantly$", "^increasingly$",
      "^incredibly$", "^indeed$", "^initially$", "^instead$", "^intensely$", "^jus$", "^just$", "^largely$", "^lately$",
      "^least$", "^legitimately$", "^less$", "^lightly$", "^likely$", "^literally$", "^loudly$", "^luckily$",
      "^mainly$", "^maybe$", "^meanwhile$", "^merely$", "^more$", "^moreover$", "^most$", "^mostly$", "^much$",
      "^namely$", "^naturally$", "^nearly$", "^necessarily$", "^nervously$", "^never$", "^nevertheless$", "^no$",
      "^nonetheless$", "^normally$", "^not$", "^notwithstanding$", "^obviously$", "^occasionally$", "^often$", "^once$",
      "^only$", "^originally$", "^otherwise$", "^overall$", "^particularly$", "^passionately$", "^perfectly$",
      "^perhaps$", "^personally$", "^physically$", "^please$", "^possibly$", "^potentially$", "^practically$",
      "^presently$", "^previously$", "^primarily$", "^probability$", "^probably$", "^profoundly$", "^prolly$",
      "^properly$", "^quickly$", "^quietly$", "^quite$", "^randomly$", "^rarely$", "^rather$", "^readily$", "^really$",
      "^recently$", "^regularly$", "^relatively$", "^respectively$", "^right$", "^roughly$", "^sadly$", "^seldomly$",
      "^seriously$", "^shortly$", "^significantly$", "^similarly$", "^simply$", "^slightly$", "^slowly$", "^so$",
      "^some$", "^somehow$", "^sometimes$", "^somewhat$", "^somewhere$", "^soon$", "^specifically$", "^still$",
      "^strongly$", "^subsequently$", "^successfully$", "^such$", "^suddenly$", "^supposedly$", "^surely$",
      "^surprisingly$", "^technically$", "^terribly$", "^thence$", "^thereafter$", "^therefor$", "^therefore$",
      "^thither$", "^thoroughly$", "^thus$", "^thusfar$", "^thusly$", "^together$", "^too$", "^totally$", "^truly$",
      "^typically$", "^ultimately$", "^uncommonly$", "^unfortunately$", "^unfortunatly$", "^usually$", "^vastly$",
      "^very$", "^virtually$", "^well$", "^whence$", "^where", "^wherefor", "^whither$", "^wholly$", "^why$", "^why'",
      "^whyd$", "^whys$", "^widely$", "^wither$", "^yet$"
    ),
    conj = c(
      "^also$", "^altho$", "^although$", "^and$", "^b/c$", "^bc$", "^because$", "^besides$", "^both$", "^but$",
      "^'cause$", "^cos$", "^cuz$", "^either$", "^else$", "^except$", "^for$", "^how$", "^how'", "^howd$", "^howll$",
      "^hows$", "^if$", "^neither$", "^nor$", "^or$", "^than$", "^tho$", "^though$", "^unless$", "^unlike$", "^versus$",
      "^vs$", "^when$", "^when'", "^whenever$", "^whereas$", "^whether$", "^while$", "^whilst$"
    ),
    prep = c(
      "^about$", "^above$", "^abt$", "^across$", "^acrost$", "^afk$", "^after$", "^against$", "^along$", "^amid",
      "^among", "^around$", "^as$", "^at$", "^atop$", "^before$", "^behind$", "^beneath$", "^beside$", "^betwe",
      "^beyond$", "^by$", "^despite$", "^down$", "^during$", "^excluding$", "^from$", "^here$", "^here'", "^heres$",
      "^in$", "^including$", "^inside$", "^into$", "^minus$", "^near$", "^now$", "^of$", "^off$", "^on$", "^onto$",
      "^out$", "^outside$", "^over$", "^plus$", "^regarding$", "^sans$", "^since$", "^then$", "^there$", "^there'",
      "^thered$", "^therell$", "^theres$", "^through$", "^throughout$", "^thru$", "^til$", "^till$", "^to$", "^toward",
      "^under$", "^underneath$", "^until$", "^untill$", "^unto$", "^up$", "^upon$", "^via$", "^with$", "^within$",
      "^without$", "^worth$"
    ),
    auxverb = c(
      "^am$", "^are$", "^arent$", "^aren't$", "^be$", "^been$", "^bein$", "^being$", "^brb$", "^can$",
      "^could$", "^could'", "^couldnt$", "^couldn't$", "^couldve$", "^did$", "^didnt$", "^didn't$", "^do$", "^does$",
      "^doesnt$", "^doesn't$", "^doing$", "^dont$", "^don't$", "^had$", "^hadnt$", "^hadn't$", "^has$", "^hasnt$",
      "^hasn't$", "^have$", "^havent$", "^haven't$", "^having$", "^is$", "^isnt$", "^isn't$", "^may$", "^might$",
      "^might'", "^mightnt$", "^mightn't$", "^mightve$", "^must$", "^mustnt$", "^mustn't$", "^mustve$", "^ought",
      "^shant$", "^shan't$", "^sha'nt$", "^shall$", "^should$", "^shouldnt$", "^shouldn't$", "^shouldve$", "^was$",
      "^wasnt$", "^wasn't$", "^were$", "^werent$", "^weren't$", "^will$", "^would$", "^would'", "^wouldnt", "^wouldn't",
      "^wouldve$"
    ),
    negate = c(
      "^ain't$", "^aint$", "^aren't$", "^arent$", "^can't$", "^cannot$", "^cant$", "^couldn't$", "^couldnt$",
      "^didn't$", "^didnt$", "^doesn't$", "^doesnt$", "^don't$", "^dont$", "^hadn't$", "^hadnt$", "^hasn't$", "^hasnt$",
      "^haven't$", "^havent$", "^idk$", "^isn't$", "^isnt$", "^must'nt$", "^mustn't$", "^mustnt$", "^nah", "^need'nt$",
      "^needn't$", "^neednt$", "^negat", "^neither$", "^never$", "^no$", "^nobod", "^noes$", "^none$", "^nope$",
      "^nor$", "^not$", "^nothing$", "^nowhere$", "^np$", "^ought'nt$", "^oughtn't$", "^oughtnt$", "^shant$",
      "^shan't$", "^sha'nt$", "^should'nt$", "^shouldn't$", "^shouldnt$", "^uh-uh$", "^wasn't$", "^wasnt$", "^weren't$",
      "^werent$", "^without$", "^won't$", "^wont$", "^wouldn't$", "^wouldnt$"
    ),
    quant = c(
      "^add$", "^added$", "^adding$", "^adds$", "^all$", "^allot$", "^alot$", "^amount$", "^amounts$",
      "^another$", "^any$", "^approximat", "^average$", "^bit$", "^bits$", "^both$", "^bunch$", "^chapter$", "^couple$",
      "^doubl", "^each$", "^either$", "^entire", "^equal", "^every$", "^extra$", "^few$", "^fewer$", "^fewest$",
      "^group", "^inequal", "^least$", "^less$", "^lot$", "^lotof$", "^lots$", "^lotsa$", "^lotta$", "^majority$",
      "^many$", "^mo$", "^mo'", "^more$", "^most$", "^much$", "^mucho$", "^multiple$", "^nada$", "^none$", "^part$",
      "^partly$", "^percent", "^piece$", "^pieces$", "^plenty$", "^remaining$", "^sampl", "^scarce$", "^scarcer$",
      "^scarcest$", "^section$", "^segment", "^series$", "^several", "^single$", "^singles$", "^singly$", "^some$",
      "^somewhat$", "^ton$", "^tons$", "^total$", "^triple", "^tripling$", "^variety$", "^various$", "^whole$"
    ),
    interrog = c(
      "^how$", "^how'd$", "^how're$", "^how's$", "^howd$", "^howre$", "^hows$", "^wat$", "^wattt", "^what$",
      "^what'd$", "^what'll$", "^what're$", "^what's$", "^whatd$", "^whatever$", "^whatll$", "^whatre$", "^whatt",
      "^when$", "^when'", "^whence$", "^whenever$", "^where$", "^where'd$", "^where's$", "^wherefore$", "^wherever$",
      "^whether$", "^which$", "^whichever$", "^whither$", "^who$", "^who'd$", "^who'll$", "^who's$", "^whoever$",
      "^wholl$", "^whom$", "^whomever$", "^whos$", "^whose$", "^whosever$", "^whoso", "^why$", "^why'", "^whyever$",
      "^wut$"
    ),
    number = c(
      "^billion", "^doubl", "^dozen", "^eight", "^eleven$", "^fift", "^first$", "^firstly$", "^firsts$",
      "^five$", "^four", "^half$", "^hundred", "^infinit", "^million", "^nine", "^once$", "^one$", "^quarter",
      "^second$", "^seven", "^single$", "^six", "^ten$", "^tenth$", "^third$", "^thirt", "^thousand", "^three$",
      "^trillion", "^twel", "^twent", "^twice$", "^two$", "^zero$", "^zillion"
    ),
    interjection = c(
      "^a+h+$", "^a+w+$", "^allas$", "^alright", "^anyhoo$", "^anyway[ysz]", "^bl[eh]+$", "^g+[eah]+$",
      "^h[ah]+$", "^h[hu]+$", "^h[mh]+$", "^l[ol]+$", "^m[hm]+$", "^meh$", "^o+h+$", "^o+k+$", "^okie", "^oo+f+$",
      "^soo+$", "^u[uh]+$", "^u+g+h+$", "^w[ow]+$", "^wee+ll+$", "^y[aes]+$", "^ya+h+$", "^yeah$", "^yus+$"
    ),
    special = list(
      ELLIPSIS = "\\.{3, }|\\. +\\. +[. ]+",
      SMILE = "\\s(?:[[{(<qd]+[\\s<-]*[;:8=]|[;:8=][\\s>-]*[]})>Dpb]+|[uUnwWmM^=+-]_[uUnwWmM^=+-])(?=\\s)",
      FROWN = "\\s(?:[]D)}>]+[\\s.,<-]*[;:8=]|[;:8=][\\s.,>-]*[[{(<]+|[Tt:;]_[Tt;:]|[uUtT;:][mMn][uUtT;:])(?=\\s)",
      LIKE = c(
        "(?<=could not) like\\b", "(?<=did not) like\\b", "(?<=did) like\\b", "(?<=didn't) like\\b",
        "(?<=do not) like\\b", "(?<=do) like\\b", "(?<=does not) like\\b", "(?<=does) like\\b", "(?<=doesn't) like\\b",
        "(?<=don't) like\\b", "(?<=i) like\\b", "(?<=should not) like\\b", "(?<=they) like\\b", "(?<=we) like\\b",
        "(?<=will not) like\\b", "(?<=will) like\\b", "(?<=won't) like\\b", "(?<=would not) like\\b",
        "(?<=you) like\\b"
      ),
      CHARACTERS = c(
        ` ` = "\\s",
        `'` = paste0(
          "[\u00B4\u2018\u2019\u201A\u201B\u2032\u2035\u02B9\u02BB\u02BE\u02BF\u02C8\u02CA\u02CB\u02F4",
          "\u0300\u0301\u030D\u0312\u0313\u0314\u0315\u031B\u0321\u0322\u0326\u0328\u0329\u0340\u0341\u0343\u0351",
          "\u0357]"
        ),
        `"` = "[\u201C\u201D\u201E\u201F\u2033\u2034\u2036\u2037\u2057\u02BA\u02DD\u02EE\u02F5\u02F6\u030B\u030F]",
        `...` = "\u2026",
        `-` = "[\u05BE\u1806\u2010\u2011\u2013\uFE58\uFE63\uFF0D]",
        ` - ` = "[\u2012\u2014\u2015\u2E3A\u2E3B]|--+",
        a = paste0(
          "[\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5\u0100\u0101\u0102",
          "\u0103\u0104\u105\u0200\u0201\u0202\u0203\u0226\u0227\u0245\u0250\u0251\u0252\u0255\u0363\u0386\u0391",
          "\u0410\u0430]"
        ),
        ae = "[\u00C6\u00E6\u0152\u0153\u0276]",
        b = paste0(
          "[\u00DF\u0180\u0181\u0182\u0183\u0184\u0185\u0186\u0187\u0188\u0189\u018A\u018B\u018C\u0243",
          "\u0253\u0299\u0411\u0412\u0431\u0432\u0462\u0463\u0494\u0495\u212C]"
        ),
        c = paste0(
          "[\u00C7\u00E7\u0106\u0107\u0108\u0109\u0186\u0187\u0188\u0254\u0297\u0368\u0421\u0441\u2102",
          "\u2103]"
        ),
        d = paste0(
          "[\u00D0\u00DE\u00FE\u010D\u010E\u010F\u0110\u0111\u0189\u0221\u0256\u0256\u0257\u0369\u0392",
          "\u0434\u0500\u2145\u2146]"
        ),
        e = paste0(
          "[\u00C8\u00C9\u00CA\u00CB\u00E8\u00E9\u00EA\u00EB\u0112\u0113\u0114\u0115\u0116\u0117\u0118",
          "\u0119\u011A\u011B\u018E\u018F\u0190\u0204\u0205\u0206\u0207\u0228\u0229\u0246\u0247\u0258\u0259\u0364",
          "\u0388\u0395\u0400\u0401\u0404\u0415\u0417\u0435\u0437\u0450\u0451\u0454\u0498\u0499\u2107\u2108\u2128",
          "\u212E\u212F\u2130\u2147]"
        ),
        f = "[\u0191\u0192\u0492\u0493\u2109\u2231\u2132\u214E]",
        g = "[\u011C\u011D\u011E\u011F\u0120\u0121\u0122\u0123\u0193\u0222\u0260\u0261\u0262\u210A\u2141]",
        h = "[\u0124\u0125\u0127\u0195\u0266\u0267\u0389\u0397\u0452\u210B\u210C\u210D\u210E\u210F]",
        i = paste0(
          "[\u00CC\u00CD\u00CE\u00CF\u00EC\u00ED\u00EE\u00EF\u0128\u0129\u012A\u012B\u012C\u012D\u012E\u012F",
          "\u0130\u0131\u0197\u019A\u0208\u0209\u0365\u0390\u0399\u0406\u0407\u0456\u0457]"
        ),
        j = "[\u0135\u0236\u0237\u0248\u0249\u0408\u0458\u2129\u2139\u2149]",
        k = "[\u0137\u0138\u0198\u0199\u212A]",
        l = "[\u0139\u013A\u013B\u013C\u013D\u013E\u013F\u0140\u0141\u0142\u0234]",
        m = "[\u0271\u0460\u2133]",
        n = paste0(
          "[\u00D1\u00F1\u0143\u0144\u0145\u0146\u0147\u0148\u0149\u014A\u014B\u0220\u0235\u0272\u0273",
          "\u0274\u0376\u0377\u0418\u0419\u0438\u0439\u2115\u2135]"
        ),
        h = "\u0149",
        o = paste0(
          "[\u00D2\u00D3\u00D4\u00D5\u00D6\u00D8\u00F0\u00F2\u00F3\u00F4\u00F5\u00F6\u00F8\u014C\u014D",
          "\u014E\u014F\u0150\u0151\u0150\u0151\u0230\u0231\u0275\u0298\u0366\u0398\u0424\u0444\u0472\u0473\u2134]"
        ),
        p = "[\u0420\u0440\u2117\u2118\u2119]",
        q = "[\u018D\u211A\u213A]",
        r = paste0(
          "[\u0154\u0155\u0156\u0157\u0158\u0159\u0211\u0212\u0213\u0279\u0280\u0281\u0433\u0453\u0490",
          "\u0491\u211B\u211C\u211D\u211F\u213E]"
        ),
        s = "[\u015A\u015C\u015D\u015E\u015F\u0160\u0161\u0160\u0161\u0218\u0219\u0405\u0455]",
        t = "[\u0162\u0163\u0164\u0165\u0166\u0167\u0371\u0373\u0422\u0442]",
        u = paste0(
          "[\u00D9\u00DA\u00DB\u00DC\u00F9\u00FA\u00FB\u00FC\u00FC\u0168\u0169\u016A\u016B\u016C\u016D",
          "\u016E\u016F\u0170\u0171\u0172\u0173\u01D3\u01D4\u01D5\u01D6\u01D7\u01D8\u01D9\u01DA\u01DB\u01DC\u0214",
          "\u0217\u0244\u0289\u0367\u0426\u0446]"
        ),
        v = "[\u0474\u0475\u0476\u0477]",
        w = "[\u0174\u0175\u0270\u0428\u0429\u0448\u0449\u0461]",
        y = "[\u00DD\u00FD\u00FF\u0176\u0177\u0178\u0232\u0233\u0423\u0427\u0443\u0447]",
        z = "[\u0179\u017A\u017B\u017C\u017E\u0224\u0225\u0240\u0290\u0291\u0396\u2124]",
        x = "[\u00D7\u0416\u0425\u0436\u0445\u0496\u0497]"
      ),
      SYMBOLS = c(
        `(cc)` = "\u00A9",
        number = "\u2116",
        sm = "\u2120",
        tel = "\u2121",
        `(tm)` = "\u2122",
        omega = "\u2126",
        alpha = "\u2127",
        fax = "\u213B",
        pi = "[\u213C\u213F]",
        sigma = "\u2140"
      )
    )
  )
  if (length(cats) == 0) cats <- names(dict)[-length(dict)]
  if (length(cats) == 1 && grepl("\\(|\\[", cats)) cats <- eval(parse(text = cats))
  if (any(grepl("[0-9]|seq", cats))) cats <- if (length(cats) > 1) as.numeric(cats) else eval(parse(text = cats))
  if (is.numeric(cats)) {
    cats <- cats[cats < length(dict)]
  } else if (any(!cats %in% names(dict))) cats <- grep(paste(paste0("^", cats), collapse = "|"), names(dict), value = TRUE)
  if (length(cats) == 0) {
    stop(
      "\n  enter numbers between 1 and ", length(dict) - 1,
      ", or letters matching a category:\n  ", paste(names(dict), collapse = ", ")
    )
  }
  if ("special" %in% cats) as.regex <- TRUE
  if (as.regex) {
    if (!missing(as.function)) {
      if ("special" %in% cats && is.function(as.function) && grepl("sub", substitute(as.function))) {
        dict <- c(dict$special$CHARACTERS, dict$special$SYMBOLS)
        fun <- as.function
        if (substitute(as.function) == "gsub") {
          charmap <- as.data.frame(unlist(lapply(as.list(dict), strsplit, "")), stringsAsFactors = FALSE)
          charmap <- data.frame(to = sub("[0-9]+", "", rownames(charmap)), from = charmap[[1]], stringsAsFactors = FALSE)
          charmap <- charmap[grepl("^\\w$", charmap$to) & !charmap$from %in% c("[", "]"), ]
          dict <- dict[!names(dict) %in% charmap$to]
          charmap <- list(to = paste(charmap$to, collapse = ""), from = paste(charmap$from, collapse = ""))
        } else {
          charmap <- NULL
        }
        function(terms, ...) {
          args <- list(...)
          args$x <- terms
          if (!is.null(charmap)) {
            args$x <- tryCatch(chartr(charmap$from, charmap$to, args$x), error = function(e) NULL)
            if (is.null(args$x)) {
              args$x <- chartr(charmap$from, charmap$to, iconv(terms, sub = "#"))
              warning("the input appears to be misencoded; it was converted, but may have errant #s")
            }
          }
          for (s in names(dict)) {
            args$pattern <- dict[s]
            args$replacement <- s
            args$x <- do.call(fun, args)
          }
          args$x
        }
      } else {
        dict <- paste(unlist(dict[cats]), collapse = "|")
        fun <- if (is.function(as.function)) as.function else grepl
        function(terms, ...) {
          args <- list(...)
          args$pattern <- dict
          args$x <- terms
          if (!is.function(as.function) && !"perl" %in% names(args)) args$perl <- TRUE
          do.call(fun, args)
        }
      }
    } else {
      dict[cats]
    }
  } else {
    lapply(dict[cats], function(l) gsub("\\^|\\$", "", sub("(?<=[^$])$", "*", l, perl = TRUE)))
  }
}

#' Calculate Text-Based Metastatistics
#'
#' Calculate simple descriptive statistics from text.
#'
#' @param text A character vector of texts.
#' @return A data.frame:
#'   \itemize{
#'     \item \strong{\code{characters}}: Total number of characters.
#'     \item \strong{\code{syllables}}: Total number of syllables, as estimated by split length of \cr
#'       \code{'a+[eu]*|e+a*|i+|o+[ui]*|u+|y+[aeiou]*'} - 1.
#'     \item \strong{\code{words}}: Total number of words (raw word count).
#'     \item \strong{\code{unique_words}}: Number of unique words (binary word count).
#'     \item \strong{\code{clauses}}: Number of clauses, as marked by commas, colons, semicolons, dashes, or brackets
#'       within sentences.
#'     \item \strong{\code{sentences}}: Number of sentences, as marked by periods, question marks, exclamation points,
#'       or new line characters.
#'     \item \strong{\code{words_per_clause}}: Average number of words per clause.
#'     \item \strong{\code{words_per_sentence}}: Average number of words per sentence.
#'     \item \strong{\code{sixltr}}: Number of words 6 or more characters long.
#'     \item \strong{\code{characters_per_word}}: Average number of characters per word
#'       (\code{characters} / \code{words}).
#'     \item \strong{\code{syllables_per_word}}: Average number of syllables per word
#'       (\code{syllables} / \code{words}).
#'     \item \strong{\code{type_token_ratio}}: Ratio of unique to total words: \code{unique_words} / \code{words}.
#'     \item \strong{\code{reading_grade}}: Flesch-Kincaid grade level: .39 * \code{words} / \code{sentences} +
#'       11.8 * \code{syllables} / \code{words} - 15.59.
#'     \item \strong{\code{numbers}}: Number of terms starting with numbers.
#'     \item \strong{\code{punct}}: Number of terms starting with non-alphanumeric characters.
#'     \item \strong{\code{periods}}: Number of periods.
#'     \item \strong{\code{commas}}: Number of commas.
#'     \item \strong{\code{qmarks}}: Number of question marks.
#'     \item \strong{\code{exclams}}: Number of exclamation points.
#'     \item \strong{\code{quotes}}: Number of quotation marks (single and double).
#'     \item \strong{\code{apostrophes}}: Number of apostrophes, defined as any modified letter apostrophe, or backtick
#'       or single straight or curly quote surrounded by letters.
#'     \item \strong{\code{brackets}}: Number of bracketing characters (including parentheses, and square,
#'       curly, and angle brackets).
#'     \item \strong{\code{orgmarks}}: Number of characters used for organization or structuring (including
#'       dashes, foreword slashes, colons, and semicolons).
#'   }
#' @examples
#' text <- c(
#'   succinct = "It is here.",
#'   verbose = "Hear me now. I shall tell you about it. It is here. Do you hear?",
#'   couched = "I might be wrong, but it seems to me that it might be here.",
#'   bigwords = "Object located thither.",
#'   excited = "It's there! It's there! It's there!",
#'   drippy = "It's 'there', right? Not 'here'? 'there'? Are you Sure?",
#'   struggly = "It's here -- in that place where it is. Like... the 1st place (here)."
#' )
#' lma_meta(text)
#' @export

lma_meta <- function(text) {
  text <- gsub("^\\s+|\\s+$", "", text)
  dtm <- lma_dtm(text, numbers = TRUE, punct = TRUE, urls = FALSE)
  text <- gsub(paste0(
    "((?:^|\\s)[a-z]+\\.[a-z.]+|\\d|(?:^|\\s)[a-z]|(?:^|\\s)[iv]+|",
    "ans|govt|apt|etc|st|rd|ft|feat|dr|drs|mr|ms|mrs|messrs|jr|prof)\\."
  ), "", text, TRUE)
  terms <- colnames(dtm)
  dwm <- dtm[, grepl("^[a-z']", terms), drop = FALSE]
  words <- colnames(dwm)
  word_lengths <- nchar(words)
  word_syllables <- vapply(strsplit(words, "a+[eu]*|e+a*|i+|o+[ui]*|u+|y+[aeiou]*"), length, 0) - 1
  word_syllables[word_syllables == 0] <- 1
  res <- data.frame(
    characters = as.numeric(dwm %*% word_lengths),
    syllables = as.numeric(dwm %*% word_syllables),
    words = rowSums(dwm),
    unique_words = rowSums(dwm != 0),
    clauses = vapply(strsplit(text, '([.?!\n,:;)}>-]|\\])([.?!\n,:;)}>\n\'"-]|\\s|\\])*'), length, 0),
    sentences = vapply(strsplit(text, '[.?!\n]([.?!\n\'"]|\\s)*'), length, 0),
    stringsAsFactors = FALSE
  )
  cbind(res, with(res, data.frame(
    words_per_clause = words / clauses,
    words_per_sentence = words / sentences,
    sixltr = as.numeric(dwm %*% (word_lengths > 5)),
    characters_per_word = characters / words,
    syllables_per_word = syllables / words,
    type_token_ratio = unique_words / words,
    reading_grade = .39 * words / sentences + 11.8 * syllables / words - 15.59,
    numbers = if (any(su <- grepl("^[0-9]", terms))) rowSums(dtm[, su, drop = FALSE]) else 0,
    puncts = if (any(su <- grepl("^[^a-z0-9]", terms))) rowSums(dtm[, su, drop = FALSE]) else 0,
    periods = if ("." %in% terms) dtm[, "."] else 0,
    commas = if ("," %in% terms) dtm[, ","] else 0,
    qmarks = if ("?" %in% terms) dtm[, "?"] else 0,
    exclams = if ("!" %in% terms) dtm[, "!"] else 0,
    quotes = if (any(su <- grepl('^[\'"]', terms))) rowSums(dtm[, su, drop = FALSE]) else 0,
    apostrophes = vapply(strsplit(text, "[\u02bc]+|[a-zA-Z][\u0027\u0060\u2019]+[a-zA-Z]"), length, 0) - 1,
    brackets = if (any(su <- grepl("[(\\)<>{\\}[]|\\]", terms))) rowSums(dtm[, su, drop = FALSE]) else 0,
    orgmarks = if (any(su <- grepl("[/:;-]", terms))) rowSums(dtm[, su, drop = FALSE]) else 0,
    row.names = seq_len(nrow(res)), stringsAsFactors = FALSE
  )))
}

#' Document-Term Matrix Weighting
#'
#' Weight a document-term matrix.
#' @param dtm A matrix with words as column names.
#' @param weight A string referring at least partially to one (or a combination; see note) of the
#'   available weighting methods:
#'
#'   \strong{Term weights} (applied uniquely to each cell)
#'   \itemize{
#'     \item \strong{\code{binary}} \cr
#'     \code{(dtm > 0) * 1} \cr
#'     Convert frequencies to 1s and 0s; remove differences in frequencies.
#'
#'     \item \strong{\code{log}} \cr
#'     \code{log(dtm + 1, log.base)} \cr
#'     Log of frequencies.
#'
#'     \item \strong{\code{sqrt}} \cr
#'     \code{sqrt(dtm)} \cr
#'     Square root of frequencies.
#'
#'     \item \strong{\code{count}} \cr
#'     \code{dtm} \cr
#'     Unaltered; sometimes called term frequencies (tf).
#'
#'     \item \strong{\code{amplify}} \cr
#'     \code{dtm ^ alpha} \cr
#'     Amplify difference in frequencies.
#'   }
#'
#'   \strong{Document weights} (applied by column)
#'   \itemize{
#'     \item \strong{\code{dflog}} \cr
#'     \code{log(colSums(dtm > 0), log.base)} \cr
#'     Log of binary term sum.
#'
#'     \item \strong{\code{entropy}} \cr
#'     \code{1 - rowSums(x *} \code{log(x + 1, log.base) /} \code{log(ncol(x), log.base),} \code{na.rm = TRUE)} \cr
#'     Where \code{x = t(dtm) / colSums(dtm > 0)}; entropy of term-conditional term distribution.
#'
#'     \item \strong{\code{ppois}} \cr
#'     \code{1 - ppois(pois.x,} \code{colSums(dtm) / nrow(dtm))} \cr
#'     Poisson-predicted term distribution.
#'
#'     \item \strong{\code{dpois}} \cr
#'     \code{1 - dpois(pois.x, colSums(dtm) / nrow(dtm))} \cr
#'     Poisson-predicted term density.
#'
#'     \item \strong{\code{dfmlog}} \cr
#'     \code{log(diag(dtm[max.col(t(dtm)), ]), log.base)} \cr
#'     Log of maximum term frequency.
#'
#'     \item \strong{\code{dfmax}} \cr
#'     \code{diag(dtm[max.col(t(dtm)), ])} \cr
#'     Maximum term frequency.
#'
#'     \item \strong{\code{df}} \cr
#'     \code{colSums(dtm > 0)} \cr
#'     Sum of binary term occurrence across documents.
#'
#'     \item \strong{\code{idf}} \cr
#'     \code{log(nrow(dtm) / colSums(dtm > 0), log.base)} \cr
#'     Inverse document frequency.
#'
#'     \item \strong{\code{ridf}} \cr
#'     \code{idf - log(dpois, log.base)} \cr
#'     Residual inverse document frequency.
#'
#'     \item \strong{\code{normal}} \cr
#'     \code{sqrt(1 / colSums(dtm ^ 2))} \cr
#'     Normalized document frequency.
#'   }
#'
#' Alternatively, \code{'pmi'} or \code{'ppmi'} will apply a pointwise mutual information weighting
#' scheme (with \code{'ppmi'} setting negative values to 0).
#' @param normalize Logical: if \code{FALSE}, the dtm is not divided by document word-count before
#'   being weighted.
#' @param wc.complete If the dtm was made with \code{\link{lma_dtm}} (has a \code{'WC'}
#'   attribute), word counts for
#'   frequencies can be based on the raw count (default; \code{wc.complete = TRUE}). If
#'   \code{wc.complete = FALSE}, or the dtm does not have a \code{'WC'} attribute,
#'   \code{rowSums(dtm)} is used as word count.
#' @param log.base The base of logs, applied to any weight using \code{\link[base]{log}}.
#'   Default is 10.
#' @param alpha A scaling factor applied to document frequency as part of pointwise mutual
#'   information weighting, or amplify's power (\code{dtm ^ alpha}, which defaults to 1.1).
#' @param pois.x integer; quantile or probability of the poisson distribution (\code{dpois(pois.x,
#'   colSums(x,} \code{na.rm = TRUE) / nrow(x))}).
#' @param doc.only Logical: if \code{TRUE}, only document weights are returned (a single value for
#'   each term).
#' @param percent Logical; if \code{TRUE}, frequencies are multiplied by 100.
#' @note
#' Term weights works to adjust differences in counts within documents, with differences meaning
#' increasingly more from \code{binary} to \code{log} to \code{sqrt} to \code{count} to \code{amplify}.
#'
#' Document weights work to treat words differently based on their between-document or overall frequency.
#' When term frequencies are constant, \code{dpois}, \code{idf}, \code{ridf}, and \code{normal} give
#' less common words increasingly more weight, and \code{dfmax}, \code{dfmlog}, \code{ppois}, \code{df},
#' \code{dflog}, and \code{entropy} give less common words increasingly less weight.
#'
#' \code{weight} can either be a vector with two characters, corresponding to term weight and
#' document weight (e.g., \code{c('count', 'idf')}), or it can be a string with term and
#' document weights separated by any of \code{:\\*_/; ,-} (e.g., \code{'count-idf'}).
#' \code{'tf'} is also acceptable for \code{'count'}, and \code{'tfidf'} will be parsed as
#' \code{c('count', 'idf')}, though this is a special case.
#'
#' For \code{weight}, term or document weights can be entered individually; term weights alone will
#' not apply any document weight, and document weights alone will apply a \code{'count'} term weight
#' (unless \code{doc.only = TRUE}, in which case a term-named vector of document weights is returned
#' instead of a weighted dtm).
#' @return A weighted version of \code{dtm}, with a \code{type} attribute added (\code{attr(dtm, 'type')}).
#' @examples
#' # visualize term and document weights
#'
#' ## term weights
#' term_weights <- c("binary", "log", "sqrt", "count", "amplify")
#' Weighted <- sapply(term_weights, function(w) lma_weight(1:20, w, FALSE))
#' if (require(splot)) splot(Weighted ~ 1:20, labx = "Raw Count", lines = "co")
#'
#' ## document weights
#' doc_weights <- c(
#'   "df", "dflog", "dfmax", "dfmlog", "idf", "ridf",
#'   "normal", "dpois", "ppois", "entropy"
#' )
#' weight_range <- function(w, value = 1) {
#'   m <- diag(20)
#'   m[upper.tri(m, TRUE)] <- if (is.numeric(value)) {
#'     value
#'   } else {
#'     unlist(lapply(
#'       1:20, function(v) rep(if (value == "inverted") 21 - v else v, v)
#'     ))
#'   }
#'   lma_weight(m, w, FALSE, doc.only = TRUE)
#' }
#'
#' if (require(splot)) {
#'   category <- rep(c("df", "idf", "normal", "poisson", "entropy"), c(4, 2, 1, 2, 1))
#'   op <- list(
#'     laby = "Relative (Scaled) Weight", labx = "Document Frequency",
#'     leg = "outside", lines = "connected", mv.scale = TRUE, note = FALSE
#'   )
#'   splot(
#'     sapply(doc_weights, weight_range) ~ 1:20,
#'     options = op, title = "Same Term, Varying Document Frequencies",
#'     sud = "All term frequencies are 1.",
#'     colorby = list(category, grade = TRUE)
#'   )
#'   splot(
#'     sapply(doc_weights, weight_range, value = "sequence") ~ 1:20,
#'     options = op, title = "Term as Document Frequencies",
#'     sud = "Non-zero terms are the number of non-zero terms.",
#'     colorby = list(category, grade = TRUE)
#'   )
#'   splot(
#'     sapply(doc_weights, weight_range, value = "inverted") ~ 1:20,
#'     options = op, title = "Term Opposite of Document Frequencies",
#'     sud = "Non-zero terms are the number of zero terms + 1.",
#'     colorby = list(category, grade = TRUE)
#'   )
#' }
#'
#' @export

lma_weight <- function(dtm, weight = "count", normalize = TRUE, wc.complete = TRUE,
                       log.base = 10, alpha = 1, pois.x = 1L, doc.only = FALSE, percent = FALSE) {
  if (is.null(dim(dtm))) dtm <- if (is.character(dtm) || is.factor(dtm)) lma_dtm(dtm) else matrix(dtm, 1)
  ck <- attr(dtm, "type")
  if (!is.null(ck) && length(ck) == 3 && (ck[1] == "TRUE" || ck[2] != "count" || ck[3] != "NA")) {
    message(
      "the entered dtm appears to already be weighted (", paste(ck[2:3], collapse = "*"),
      "), so it will not be altered"
    )
    return(dtm)
  }
  weight <- tolower(weight)
  if (missing(normalize) && any(grepl("pmi", weight))) normalize <- FALSE
  if (normalize) {
    wc <- attr(dtm, "WC")
    if (is.null(wc) || !wc.complete || nrow(dtm) != length(wc)) wc <- rowSums(dtm, na.rm = TRUE)
    adj <- if (percent) 100 else 1
    if (.hasSlot(dtm, "x") && .hasSlot(dtm, "i")) {
      wc <- wc[dtm@i + 1]
      su <- wc != 0
      dtm@x[su] <- dtm@x[su] / wc[su] * adj
    } else {
      su <- wc != 0
      dtm[su, ] <- dtm[su, ] / wc[su] * adj
    }
  }
  nr <- nrow(dtm)
  if (any(grepl("pmi", weight))) {
    tw <- dw <- "pmi"
    if (missing(log.base)) log.base <- 2
    twc <- sum(dtm, na.rm = TRUE)
    pc <- matrix(colSums(dtm^alpha, na.rm = TRUE) / twc^alpha, 1)
    dtm <- dtm / twc
    dtm <- dtm / rowSums(dtm, na.rm = TRUE) %*% pc
    if (.hasSlot(dtm, "x")) {
      dtm@x <- log(dtm@x, base = log.base)
      dtm@x[!is.finite(dtm@x)] <- 0
    } else {
      dtm <- log(dtm, base = log.base)
      dtm[!is.finite(dtm)] <- 0
    }
    if (any(grepl("pp", weight))) {
      tw <- dw <- "ppmi"
      dtm[dtm < 0] <- 0
    }
  } else {
    term <- function(x, type) {
      switch(type,
        binary = (x > 0) * 1,
        log = log(x + 1, base = log.base),
        sqrt = sqrt(x),
        count = x,
        amplify = x^alpha
      )
    }
    doc <- function(x, type) {
      d <- switch(type,
        df = colSums(x > 0, na.rm = TRUE),
        dflog = log(colSums(x > 0, na.rm = TRUE), base = log.base),
        dfmax = diag(x[max.col(t(x)), ]),
        dfmlog = log(diag(x[max.col(t(x)), ]), base = log.base),
        idf = log(nrow(x) / colSums(x > 0, na.rm = TRUE), base = log.base),
        normal = sqrt(1 / colSums(x^2, na.rm = TRUE)),
        dpois = 1 - dpois(pois.x, colSums(x, na.rm = TRUE) / nrow(x)),
        ppois = 1 - ppois(pois.x, colSums(x, na.rm = TRUE) / nrow(x)),
        ridf = doc(x, "idf") - log(doc(x, "dpois"), base = log.base),
        entropy = {
          x <- t(x) / colSums(x > 0, na.rm = TRUE)
          1 - rowSums(x * log(x + 1, base = log.base) /
            log(ncol(x), base = log.base), na.rm = TRUE)
        }
      )
      d[!is.finite(d)] <- 0
      d
    }
    if (length(weight) == 1) {
      weight <- strsplit(weight, " *[:\\*_/; ,-] *")[[1]]
      if (length(weight) == 1 && weight == "tfidf") weight <- c("count", "idf")
    }
    if (grepl("^(?:t|na|non|f)", weight[1])) weight[1] <- "count"
    tws <- c("binary", "log", "sqrt", "count", "amplify")
    tw <- if (weight[1] == "") "count" else grep(substr(weight[1], 0, 4), tws, value = TRUE)[1]
    pdw <- TRUE
    dws <- c("df", "dflog", "dfmax", "dfmlog", "idf", "normal", "dpois", "ppois", "ridf", "entropy")
    if (is.na(tw)) {
      tw <- grep(substr(weight[1], 0, 4), dws, value = TRUE)[1]
      if (!is.na(tw)) {
        pdw <- FALSE
        if (!doc.only) {
          dw <- tw
          tw <- "count"
        } else {
          return(doc(dtm, tw))
        }
      } else {
        stop(paste(weight, collapse = " * "), " is not a recognized weight", call. = FALSE)
      }
    }
    if (pdw) dw <- if (length(weight) > 1) grep(substr(weight[2], 0, 4), dws, value = TRUE)[1] else "none"
    if (is.na(dw)) dw <- "none"
    if (missing(alpha) && tw == "amplify") alpha <- 1.1
    dtm <- if (dw == "none") term(dtm, tw) else term(dtm, tw) * rep(doc(dtm, dw), each = nr)
  }
  attr(dtm, "type") <- c(normalized = normalize, term = tw, document = dw)
  dtm
}

#' Latent Semantic Space (Embeddings) Operations
#'
#' Map a document-term matrix onto a latent semantic space, extract terms from a
#' latent semantic space (if \code{dtm} is a character vector, or \code{map.space =} \code{FALSE}),
#' or perform a singular value decomposition of a document-term matrix (if \code{dtm} is a matrix
#' and \code{space} is missing).
#' @param dtm A matrix with terms as column names, or a character vector of terms to be extracted
#'   from a specified space. If this is of length 1 and \code{space} is missing, it will be treated
#'   as \code{space}.
#' @param space A matrix with terms as rownames. If missing, this will be the right singular vectors
#'   of a singular value decomposition of \code{dtm}. If a character, a file matching the character
#'   will be searched for in \code{dir} (e.g., \code{space = 'google'}). If a file is not found and
#'   the character matches one of the \href{https://osf.io/489he/wiki/home}{available spaces}, you
#'   will be given the option to download it, as handled by \code{\link{download.lspace}}.
#'   If \code{dtm} is missing, the entire space will be loaded and returned.
#' @param map.space Logical: if \code{FALSE}, the original vectors of \code{space} for terms
#'   found in \code{dtm} are returned. Otherwise \code{dtm} \code{\%*\%} \code{space} is returned,
#'   excluding uncommon columns of \code{dtm} and rows of \code{space}.
#' @param fill.missing Logical: if \code{TRUE} and terms are being extracted from a space, includes
#'   terms not found in the space as rows of 0s, such that the returned matrix will have a row
#'   for every requested term.
#' @param term.map A matrix with \code{space} as a column name, terms as row names, and indices of
#'   the terms in the given space as values, or a numeric vector of indices with terms as names, or
#'   a character vector of terms corresponding to rows of the space. This is used instead of reading
#'   in an "_terms.txt" file corresponding to a \code{space} entered as a character (the name of a
#'   space file).
#' @param dim.cutoff If a \code{space} is calculated, this will be used to decide on the number of
#'   dimensions to be retained: \code{cumsum(d) / sum(d) < dim.cutoff}, where \code{d} is a vector
#'   of singular values of \code{dtm} (i.e., \code{svd(dtm)$d}). The default is \code{.5}; lower
#'   cutoffs result in fewer dimensions.
#' @param keep.dim Logical: if \code{TRUE}, and a space is being calculated from the input, a matrix
#'   in the same dimensions as \code{dtm} is returned. Otherwise, a matrix with terms as rows and
#'   dimensions as columns is returned.
#' @param use.scan Logical: if \code{TRUE}, reads in the rows of \code{space} with \code{\link{scan}}.
#' @param dir Path to a folder containing spaces. \cr
#'   Set a session default with \code{options(lingmatch.lspace.dir = 'desired/path')}.
#' @note
#' A traditional latent semantic space is a selection of right singular vectors from the singular
#' value decomposition of a dtm (\code{svd(dtm)$v[, 1:k]}, where \code{k} is the selected number of
#' dimensions, decided here by \code{dim.cutoff}).
#'
#' Mapping a new dtm into a latent semantic space consists of multiplying common terms:
#' \code{dtm[, ct]} \code{\%*\% space[ct, ]}, where \code{ct} \code{=} \code{colnames(dtm)[colnames(dtm)}
#' \code{\%in\%} \code{rownames(space)]} -- the terms common between the dtm and the space. This
#' results in a matrix with documents as rows, and dimensions as columns, replacing terms.
#' @family Latent Semantic Space functions
#' @return A matrix or sparse matrix with either (a) a row per term and column per latent dimension (a latent
#' space, either calculated from the input, or retrieved when \code{map.space = FALSE}), (b) a row per document
#' and column per latent dimension (when a dtm is mapped to a space), or (c) a row per document and
#' column per term (when a space is calculated and \code{keep.dim = TRUE}).
#' @examples
#' text <- c(
#'   paste(
#'     "Hey, I like kittens. I think all kinds of cats really are just the",
#'     "best pet ever."
#'   ),
#'   paste(
#'     "Oh year? Well I really like cars. All the wheels and the turbos...",
#'     "I think that's the best ever."
#'   ),
#'   paste(
#'     "You know what? Poo on you. Cats, dogs, rabbits -- you know, living",
#'     "creatures... to think you'd care about anything else!"
#'   ),
#'   paste(
#'     "You can stick to your opinion. You can be wrong if you want. You know",
#'     "what life's about? Supercharging, diesel guzzling, exhaust spewing,",
#'     "piston moving ignitions."
#'   )
#' )
#'
#' dtm <- lma_dtm(text)
#'
#' # calculate a latent semantic space from the example text
#' lss <- lma_lspace(dtm)
#'
#' # show that document similarities between the truncated and full space are the same
#' spaces <- list(
#'   full = lma_lspace(dtm, keep.dim = TRUE),
#'   truncated = lma_lspace(dtm, lss)
#' )
#' sapply(spaces, lma_simets, metric = "cosine")
#'
#' \dontrun{
#'
#' # specify a directory containing spaces,
#' # or where you would like to download spaces
#' space_dir <- "~/Latent Semantic Spaces"
#'
#' # map to a pretrained space
#' ddm <- lma_lspace(dtm, "100k", dir = space_dir)
#'
#' # load the matching subset of the space
#' # without mapping
#' lss_100k_part <- lma_lspace(colnames(dtm), "100k", dir = space_dir)
#'
#' ## or
#' lss_100k_part <- lma_lspace(dtm, "100k", map.space = FALSE, dir = space_dir)
#'
#' # load the full space
#' lss_100k <- lma_lspace("100k", dir = space_dir)
#'
#' ## or
#' lss_100k <- lma_lspace(space = "100k", dir = space_dir)
#' }
#' @export

lma_lspace <- function(dtm = "", space, map.space = TRUE, fill.missing = FALSE, term.map = NULL,
                       dim.cutoff = .5, keep.dim = FALSE, use.scan = FALSE, dir = getOption("lingmatch.lspace.dir")) {
  if (ckd <- dir == "") dir <- "~/Latent Semantic Spaces"
  if ((is.character(dtm) || is.factor(dtm)) && missing(space)) {
    if (length(dtm) > 1 && any(grepl(" ", dtm, fixed = TRUE))) {
      dtm <- lma_dtm(dtm)
    } else if (length(dtm) == 1 && dtm != "") {
      if (missing(use.scan)) use.scan <- TRUE
      space <- dtm
      dtm <- ""
    }
  }
  if (is.data.frame(dtm)) dtm <- as.matrix(dtm)
  if (missing(space)) {
    nr <- nrow(dtm)
    if (is.null(nr)) stop("enter a matrix for dtm, or specify a space")
    s <- svd(dtm)
    s$v <- t(s$v)
    k <- cumsum(s$d) / sum(s$d)
    if (dim.cutoff > 1) dim.cutoff <- 1
    k <- if (length(k) == 1) 1 else seq_len(if (any(k < dim.cutoff)) which(k >= dim.cutoff)[1] else 1)
    if (keep.dim) {
      dtm[] <- s$u[, k] %*% (if (length(k) == 1) matrix(s$d[k]) else diag(s$d[k])) %*% s$v[k, ]
    } else {
      cn <- colnames(dtm)
      dtm <- t(s$v[k, , drop = FALSE])
      rownames(dtm) <- cn
    }
  } else {
    terms <- if (is.null(colnames(dtm))) {
      map.space <- FALSE
      dtm
    } else {
      colnames(dtm)
    }
    if (is.character(space)) {
      if (space == "default") space <- "100k_lsa"
      name <- gsub("^.*[/\\]|\\.[^/\\]*$", "", space)[1]
      spaces <- list.files(dir)
      ts <- grep(space, spaces, fixed = TRUE, value = TRUE)
      if (!length(ts)) {
        ts <- rownames(select.lspace(name)$selected)[1]
        if (!ckd && !is.na(ts) && grepl("^$|^[yt1]|^ent", readline(paste0(
          "would you like to download the ", ts, " space? (press Enter for yes): "
        )))) {
          download.lspace(ts, dir = dir)
          ts <- paste0(ts, ".dat")
        } else {
          stop("space (", space, ") not found in dir (", dir, ")",
            if (ckd) '\nspecify a directory (e.g., dir = "~") to locate or download; see ?download.lspace',
            call. = FALSE
          )
        }
      }
      space_path <- normalizePath(paste0(dir, "/", if (length(su <- grep("\\.dat$", ts))) {
        ts[su[[1]]]
      } else {
        use.scan <- TRUE
        ts[grep("[bgx]z[ip2]*$", ts)[[1]]]
      }), "/", FALSE)
      name <- sub("\\.[^.]*$", "", basename(space_path))
      if (name %in% colnames(term.map)) term.map <- term.map[term.map[, name] != 0, name]
      rex <- function(inds, f) {
        nc <- length(strsplit(readLines(f, 1), "\\s+")[[1]])
        l <- length(inds)
        all <- all(seq_len(l) == inds)
        r <- matrix(0, l, nc)
        i <- 1
        con <- file(f, "r")
        on.exit(close(con))
        while (i <= l) {
          if (all) {
            n <- l
          } else {
            n <- 1
            while (i + n < l && inds[i + n - 1] == inds[i + n] - 1) n <- n + 1
          }
          r[seq_len(n) + i - 1, ] <- matrix(scan(
            con,
            n = n * nc, quiet = TRUE, skip = (if (i == 1) {
              inds[i]
            } else {
              inds[i] - inds[i - 1]
            }) - 1, quote = "", comment.char = "", na.strings = ""
          ), n, nc, byrow = TRUE)
          i <- i + n
        }
        r
      }
      if (is.null(term.map)) {
        terms_file <- paste0(dir, "/", name, "_terms.txt")
        if (!file.exists(terms_file)) stop("terms file (", space, "_terms.txt) not found in dir (", dir, ").")
        space_terms <- readLines(terms_file)
        su <- if (length(terms) == 1 && terms == "") {
          terms <- space_terms
          !logical(length(space_terms))
        } else {
          space_terms %in% terms
        }
        if (sum(su) < length(terms)) {
          lsterms <- tolower(space_terms)
          su2 <- !duplicated(lsterms) & lsterms %in% terms[!terms %in% space_terms[su]]
          if (any(su2)) {
            space_terms[su2] <- lsterms[su2]
            su <- su | su2
          }
        }
        if (sum(su)) {
          space <- if (use.scan) rex(which(su), space_path) else t(extract_indices(which(su), space_path))
          rownames(space) <- space_terms[su]
          ts <- terms[terms %in% rownames(space)]
          space <- space[ts, , drop = FALSE]
        } else {
          stop("no matching terms in space ", space)
        }
      } else {
        if (is.character(term.map)) term.map <- structure(seq_along(term.map), names = term.map)
        su <- which(names(term.map) %in% terms)
        inds <- as.numeric(sort(if (length(terms) == 1 && terms == "") term.map else term.map[su]))
        if (length(inds)) {
          space <- if (use.scan) rex(inds, space_path) else t(extract_indices(inds, space_path))
          rownames(space) <- ts <- names(term.map)[inds]
        } else {
          stop("no matching terms in space ", space)
        }
      }
    } else {
      if (is.data.frame(space)) space <- as.matrix(space)
      name <- deparse(substitute(space))
      su <- terms %in% rownames(space)
      if (sum(su)) {
        ts <- terms[su]
        space <- space[ts, , drop = FALSE]
      } else if (sum(su <- terms %in% colnames(space))) {
        ts <- terms[su]
        space <- t(space[, ts, drop = FALSE])
      } else {
        stop("no matching terms in provided space")
      }
    }
    if (fill.missing) {
      su <- which(!terms %in% rownames(space))
      if (length(su)) {
        space <- rbind(space, matrix(0, length(su), ncol(space), dimnames = list(terms[su])))
        space <- space[terms, ]
      }
      ts <- rownames(space)
    }
    attr(space, "space") <- name
    if (map.space) {
      rep <- length(ts) / ncol(dtm)
      if (rep < .2) {
        warning(paste0(
          "only ", round(rep * 100, 2), "% of dtm terms appear in the provided space; ",
          "you might consider using a different source or cleaning/partial matching terms"
        ), call. = FALSE)
      }
      dtm <- dtm[, ts, drop = FALSE] %*% space
      attr(dtm, "space") <- name
    } else {
      return(space)
    }
  }
  dtm
}

#' Assess Dictionary Categories Within a Latent Semantic Space
#'
#'
#' @param dict A vector of terms, list of such vectors, or a matrix-like object to be
#' categorized by \code{\link{read.dic}}.
#' @param space A vector space used to calculate similarities between terms.
#' Names of spaces (see \code{\link{select.lspace}}), a matrix with terms as row names, or
#' \code{"auto"} to auto-select a space based on matched terms. This can also be \code{multi}
#' to use multiple spaces, which are combined after similarities are calculated.
#' @param n_spaces Number of spaces to draw from if \code{space} is \code{multi}.
#' @param suggest Logical; if \code{TRUE}, will search for other terms for possible inclusion
#' in \code{space}.
#' @param suggestion_terms Number of terms to use when selecting suggested additions.
#' @param suggest_stopwords Logical; if \code{TRUE}, will suggest function words.
#' @param suggest_discriminate Logical; if \code{TRUE}, will adjust for similarity to other
#' categories when finding suggestions.
#' @param expand_cutoff_freq Proportion of mapped terms to include when expanding dictionary terms.
#' Applies when \code{space} is a character (referring to a space to be loaded).
#' @param expand_cutoff_spaces Number of spaces in which a term has to appear to be considered
#' for expansion. Applies when \code{space} is a character (referring to a space to be loaded).
#' @param dimension_prop Proportion of dimensions to use when searching for suggested additions,
#' where less than 1 will calculate similarities to the category core using fewer dimensions
#' of the space.
#' @param pairwise Logical; if \code{FALSE}, will compare candidate suggestion terms with a single,
#' averaged category vector rather than all category terms separately.
#' @param glob Logical; if \code{TRUE}, converts globs (asterisk wildcards) to regular expressions.
#' @param space_dir Directory from which \code{space} should be loaded.
#' @param verbose Logical; if \code{FALSE}, will not show status messages.
#' @family Dictionary functions
#' @seealso
#' To just expand fuzzy terms, see \code{\link{report_term_matches}()}.
#'
#' Similar information is provided in the \href{https://miserman.github.io/dictionary_builder/}{dictionary builder} web tool.
#' @returns A list:
#' \itemize{
#'   \item \strong{\code{expanded}}: A version of \code{dict} with fuzzy terms expanded.
#'   \item \strong{\code{summary}}: A summary of each dictionary category.
#'   \item \strong{\code{terms}}: Match (expanded term) similarities within terms and categories.
#'   \item \strong{\code{suggested}}: If \code{suggest} is \code{TRUE}, a list with suggested
#'   additions for each dictionary category. Each entry is a named numeric vector with
#'   similarities for each suggested term.
#' }
#' @examples
#' if (dir.exists("~/Latent Semantic Spaces")) {
#'   dict <- list(
#'     furniture = c("table", "chair", "desk*", "couch*", "sofa*"),
#'     well_adjusted = c("happy", "bright*", "friend*", "she", "he", "they")
#'   )
#'   dictionary_meta(dict, space_dir = "~/Latent Semantic Spaces")
#' }
#' @export

dictionary_meta <- function(
    dict, space = "auto", n_spaces = 5, suggest = FALSE, suggestion_terms = 10, suggest_stopwords = FALSE,
    suggest_discriminate = TRUE, expand_cutoff_freq = .98, expand_cutoff_spaces = 10,
    dimension_prop = 1, pairwise = TRUE, glob = TRUE, space_dir = getOption("lingmatch.lspace.dir"), verbose = TRUE) {
  if (missing(dict)) stop("dict must be specified", call. = FALSE)
  if (!is.list(dict)) dict <- list(dict)
  if (is.null(names(dict))) names(dict) <- paste0("cat", seq_along(dict))
  st <- proc.time()[[3]]
  if (verbose) cat("preparing terms (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
  terms <- data.frame(
    category = rep(names(dict), vapply(dict, length, 0)), term = unlist(dict), stringsAsFactors = FALSE
  )
  rownames(terms) <- NULL
  terms$regex <- paste0("\\b", to_regex(list(terms$term), TRUE, glob)[[1]], "\\b")
  if (is.character(space)) {
    term_map <- select.lspace(dir = space_dir, get.map = TRUE)$term_map
    if (is.null(term_map)) {
      stop(
        "term map not found; specify `space_dir` or provide text",
        call. = FALSE
      )
    }
    if (expand_cutoff_freq > 0 && expand_cutoff_freq < 1) {
      term_map <- term_map[seq(1, ceiling(nrow(term_map) * expand_cutoff_freq)), , drop = FALSE]
    }
    if (expand_cutoff_spaces > 0 && expand_cutoff_spaces < ncol(term_map)) {
      term_map <- term_map[rowSums(term_map != 0) >= expand_cutoff_spaces, , drop = FALSE]
    }
    if (!grepl("^(?:auto|multi)", space[[1]], TRUE)) {
      space <- space[space %in% colnames(term_map)]
      if (!length(space)) stop("`space` not found in `term_map` colnames", call. = FALSE)
      term_map <- term_map[rowSums(term_map[, space, drop = FALSE] != 0) != 0, , drop = FALSE]
    }
    space_terms <- rownames(term_map)
  } else {
    space_terms <- rownames(space)
    if (is.null(space_terms)) {
      stop(
        "for space, enter a name or matrix-like object with terms as rownames",
        call. = FALSE
      )
    }
  }
  if (verbose) cat("expanding terms (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
  matches <- extract_matches(terms$regex, paste(space_terms, collapse = "  "), TRUE)
  matched_terms <- unique(unlist(lapply(matches, names), use.names = FALSE))
  if (!length(matched_terms)) stop("no `dict` terms matched any space terms", call. = FALSE)
  multi <- FALSE
  if (is.character(space)) {
    if (length(space) == 1 && !missing(n_spaces) && n_spaces > 1) space <- "multi"
    multi <- grepl("^multi", space[[1]], TRUE)
    if (length(space) > 1 || multi) {
      if (length(space) == 1 && multi) {
        term_map_matched <- term_map[rownames(term_map) %in% matched_terms, , drop = FALSE]
        commonness <- sort(-rowSums(term_map_matched != 0))
        common_terms <- names(commonness[commonness >= min(max(commonness), n_spaces)])
        space <- names(sort(-colSums(term_map_matched[common_terms, , drop = FALSE])))[
          seq_len(max(1, min(nrow(term_map), n_spaces)))
        ]
      }
      multi <- TRUE
      term_map <- term_map[, space, drop = FALSE]
      space_terms <- rownames(term_map)[rowSums(term_map != 0) == length(space)]
      space_name <- paste(space, collapse = ", ")
      if (verbose) cat("loading spaces (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
      space <- lapply(space, function(s) {
        lma_lspace(
          if (suggest) space_terms else matched_terms[matched_terms %in% space_terms], s,
          dir = space_dir
        )
      })
      matches <- lapply(matches, function(l) l[names(l) %in% space_terms])
      matched_terms <- unique(unlist(lapply(matches, names), use.names = FALSE))
    } else {
      if (space == "auto") {
        space <- colnames(term_map)[
          which.max(colSums(term_map[rownames(term_map) %in% matched_terms, ] != 0))
        ]
      }
      space_name <- space
      space_terms <- rownames(term_map)[term_map[, space] != 0]
      if (verbose) cat("loading space (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
      space <- lma_lspace(
        if (suggest) space_terms else matched_terms[matched_terms %in% space_terms], space,
        dir = space_dir
      )
      matches <- lapply(matches, function(l) l[names(l) %in% space_terms])
      matched_terms <- unique(unlist(lapply(matches, names), use.names = FALSE))
    }
  } else {
    if (is.data.frame(space)) space <- as.matrix(space)
    space_name <- "custom"
  }
  cat_names <- structure(names(dict), names = names(dict))
  dict_exp <- lapply(cat_names, function(cat) {
    unique(names(unlist(matches[terms$category == cat])))
  })
  if (multi) {
    if (!suggest) space_terms <- rownames(space[[1]])
    if (verbose) cat("calculating term similarities (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
    sims <- lapply(cat_names, function(cat) {
      su <- space_terms %in% dict_exp[[cat]]
      if (any(su)) {
        aggsim <- NULL
        for (i in seq_along(space)) {
          s <- space[[i]]
          if (dimension_prop < 1) {
            loadings <- colMeans(s[su, , drop = FALSE])
            dsu <- order(-loadings)[seq(1, max(1, ceiling(ncol(s) * dimension_prop)))]
            s <- s[, dsu, drop = FALSE]
          }
          if (pairwise) {
            sim <- lma_simets(s, s[su, ], metric = "cosine", pairwise = TRUE, symmetrical = TRUE)
            if (is.null(dim(sim))) sim <- as(t(t(sim)), "CsparseMatrix")
            diag(sim[su, , drop = FALSE]) <- 0
            ms <- min(sim)
            sim <- (sim - ms) / (max(sim) - ms) * sign(sim)
            diag(sim[su, , drop = FALSE]) <- 1
          } else {
            sim <- lma_simets(s, colMeans(s[su, , drop = FALSE]), metric = "cosine", pairwise = TRUE, symmetrical = TRUE)
          }
          if (i == 1) {
            aggsim <- sim
          } else {
            aggsim <- aggsim + sim
          }
        }
        if (is.null(dim(aggsim))) aggsim <- as(t(t(aggsim)), "CsparseMatrix")
        aggsim / length(space)
      }
    })
  } else {
    space <- as(space, "CsparseMatrix")
    if (!suggest) space_terms <- rownames(space)
    if (verbose) cat("calculating term similarities (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
    sims <- lapply(cat_names, function(cat) {
      su <- space_terms %in% dict_exp[[cat]]
      if (any(su)) {
        if (dimension_prop < 1) {
          loadings <- colSums(space[su, , drop = FALSE])
          dsu <- order(loadings, decreasing = TRUE)[seq(1, max(1, ceiling(ncol(space) * dimension_prop)))]
          space <- space[, dsu, drop = FALSE]
        }
        sim <- lma_simets(
          space, if (pairwise) space[su, , drop = FALSE] else colMeans(space[su, , drop = FALSE]),
          metric = "cosine", pairwise = TRUE, symmetrical = TRUE
        )
        if (is.null(dim(sim))) sim <- as(t(t(sim)), "CsparseMatrix")
        sim
      }
    })
  }
  if (suggest) {
    if (verbose) cat("identifying potential additions (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
    if (!suggest_stopwords) is_stop <- lma_dict(as.function = TRUE)
    full_loadings <- do.call(cbind, lapply(sims, function(x) {
      if (length(x)) {
        agg <- rowMeans(x)
        agg[agg < 0] <- 0
        agg
      } else {
        structure(numeric(length(space_terms)), names = space_terms)
      }
    }))
    loading_cat <- names(cat_names)[max.col(full_loadings)]
    suggested <- lapply(cat_names, function(cat) {
      s <- sims[[cat]]
      if (length(s)) {
        su <- !rownames(s) %in% dict_exp[[cat]] & loading_cat == cat
        loadings <- sort(if (suggest_discriminate && ncol(full_loadings) > 1) {
          nl <- full_loadings[su, colnames(full_loadings) != cat, drop = FALSE]
          (rowMeans(s[su, , drop = FALSE]) - nl[
            rep(seq_len(ncol(nl)), each = nrow(nl)) == max.col(nl)
          ]) / 2
        } else {
          rowMeans(s[su, , drop = FALSE])
        }, TRUE)
        if (length(loadings)) {
          if (!suggest_stopwords) loadings <- loadings[!is_stop(names(loadings))]
          co <- min(length(loadings), max(which(loadings > 0)), suggestion_terms)
          loadings[loadings > loadings[co] + Reduce("-", range(loadings[seq(1, co)])) / 2]
        }
      }
    })
  }
  if (verbose) cat("preparing results (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
  match_counts <- vapply(matches, length, 0)
  term_summary <- data.frame(
    terms[rep(seq_len(nrow(terms)), match_counts), c("category", "term")],
    match = unlist(lapply(matches, names)),
    stringsAsFactors = FALSE
  )
  term_summary <- cbind(term_summary, do.call(rbind, lapply(
    split(
      term_summary[, c("category", "match", "term")],
      term_summary$category
    )[unique(term_summary$category)],
    function(cl) {
      cat <- cl$category[[1]]
      if (pairwise) {
        s <- sims[[cat]]
      } else {
        su <- space_terms %in% dict_exp[[cat]]
        if (multi) {
          aggsim <- NULL
          for (i in seq_along(space)) {
            s <- space[[i]]
            if (sum(su) == 1) {
              sim <- Matrix(1, 1, dimnames = as.list(rep(rownames(s)[su], 2)), sparse = TRUE)
            } else {
              sim <- lma_simets(s[su, , drop = FALSE], metric = "cosine", pairwise = TRUE, symmetrical = TRUE)
              if (is.null(dim(sim))) sim <- as(t(t(sim)), "CsparseMatrix")
              diag(sim) <- 0
              ms <- min(sim)
              sim <- (sim - ms) / (max(sim) - ms) * sign(sim)
              diag(sim) <- 1
            }
            if (i == 1) {
              aggsim <- sim
            } else {
              aggsim <- aggsim + sim
            }
          }
          s <- aggsim / length(space)
        } else {
          s <- lma_simets(space[su, , drop = FALSE], metric = "cosine", pairwise = TRUE, symmetrical = TRUE)
        }
      }
      if (is.null(s)) {
        cbind(sim.term = cl$match, sim.category = 0)
      } else {
        su <- !(cl$match %in% rownames(s))
        if (any(su)) {
          s <- rbind(s, Matrix(
            0, sum(su), ncol(s),
            dimnames = list(cl$match[su], colnames(s)), sparse = TRUE
          ))
        }
        term_sims <- unlist(lapply(unname(split(cl$match, cl$term)[unique(cl$term)]), function(l) {
          if (length(l) == 1) {
            structure(1, names = l)
          } else {
            cols <- l[l %in% colnames(s)]
            s[l, cols[which.min(nchar(cols))]]
          }
        }))
        cat_sims <- s[cl$match, which.max(if (is.null(dim(s))) {
          s
        } else if (ncol(s) == 1) {
          1
        } else {
          colMeans(s[colnames(s), , drop = FALSE])
        })]
        cbind(sim.term = term_sims, sim.category = if (is.null(cat_sims)) 0 else cat_sims)
      }
    }
  )))
  summary <- cbind(data.frame(
    category = cat_names,
    n_terms = vapply(dict, length, 0),
    n_expanded = tapply(match_counts, terms$category, sum)[cat_names],
    sim.space = space_name,
    stringsAsFactors = FALSE
  ), sim = do.call(rbind, lapply(sims, function(s) {
    if (length(s) && !is.null(ncol(s)) && ncol(s) == 1) {
      m <- s[matched_terms, 1]
    } else if (length(s)) {
      s <- s[colnames(s), , drop = FALSE]
      m <- (colSums(s) - 1) / (ncol(s) - 1)
    } else {
      m <- 0
    }
    structure(summary(m), names = c("min", "q1", "median", "mean", "q3", "max"))
  })))
  if (verbose) cat("done (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
  list(expanded = dict_exp, summary = summary, terms = term_summary, suggested = if (suggest) suggested)
}

#' Document-Term Matrix Creation
#'
#' Creates a document-term matrix (dtm) from a set of texts.
#' @param text Texts to be processed. This can be a vector (such as a column in a data frame)
#'   or list. When a list, these can be in the form returned with \code{tokens.only = TRUE},
#'   or a list with named vectors, where names are tokens and values are frequencies or the like.
#' @param exclude A character vector of words to be excluded. If \code{exclude} is a single string
#'   matching \code{'function'}, \code{lma_dict(1:9)} will be used.
#' @param context A character vector used to reformat text based on look- ahead/behind. For example,
#'   you might attempt to disambiguate \emph{like} by reformatting certain \emph{like}s
#'   (e.g., \code{context = c('(i) like*', '(you) like*', '(do) like')}, where words in parentheses are
#'   the context for the target word, and asterisks denote partial matching). This would be converted
#'   to regular expression (i.e., \code{'(? <= i) like\\\\b'}) which, if matched, would be
#'   replaced with a coded version of the word (e.g., \code{"Hey, i like that!"} would become
#'   \code{"Hey, i i-like that!"}). This would probably only be useful for categorization, where a
#'   dictionary would only include one or another version of a word (e.g., the LIWC 2015 dictionary
#'   does something like this with \emph{like}, and LIWC 2007 did something like this with
#'   \emph{kind (of)}, both to try and clean up the posemo category).
#' @param replace.special Logical: if \code{TRUE}, special characters are replaced with regular
#'   equivalents using the \code{\link{lma_dict}} special function.
#' @param numbers Logical: if \code{TRUE}, numbers are preserved.
#' @param punct Logical: if \code{TRUE}, punctuation is preserved.
#' @param urls Logical: if \code{FALSE}, attempts to replace all urls with "repurl".
#' @param emojis Logical: if \code{TRUE}, attempts to replace emojis (e.g., ":(" would be replaced
#'   with "repfrown").
#' @param to.lower Logical: if \code{FALSE}, words with different capitalization are treated as
#'   different terms.
#' @param word.break A regular expression string determining the way words are split. Default is
#'   \code{' +'} which breaks words at one or more blank spaces. You may also like to break by
#'   dashes or slashes (\code{'[ /-]+'}), depending on the text.
#' @param dc.min Numeric: excludes terms appearing in the set number or fewer documents.
#'   Default is 0 (no limit).
#' @param dc.max Numeric: excludes terms appearing in the set number or more. Default
#'   is Inf (no limit).
#' @param sparse Logical: if \code{FALSE}, a regular dense matrix is returned.
#' @param tokens.only Logical: if \code{TRUE}, returns a list rather than a matrix, with these entries:
#'   \tabular{ll}{
#'     \code{tokens} \tab A vector of indices with terms as names. \cr
#'     \code{frequencies} \tab A vector of counts with terms as names. \cr
#'     \code{WC} \tab A vector of term counts for each document. \cr
#'     \code{indices} \tab A list with a vector of token indices for each document. \cr
#'   }
#' @note
#' This is a relatively simple way to make a dtm. To calculate the (more or less) standard forms of
#' LSM and LSS, a somewhat raw dtm should be fine, because both processes essentially use
#' dictionaries (obviating stemming) and weighting or categorization (largely obviating 'stop word'
#' removal). The exact effect of additional processing will depend on the dictionary/semantic space
#' and weighting scheme used (particularly for LSA). This function also does some processing which
#' may matter if you plan on categorizing with categories that have terms with look- ahead/behind assertions
#' (like LIWC dictionaries). Otherwise, other methods may be faster, more memory efficient, and/or more featureful.
#' @return A sparse matrix (or regular matrix if \code{sparse = FALSE}), with a row per \code{text},
#' and column per term, or a list if \code{tokens.only = TRUE}. Includes an attribute with options (\code{opts}),
#' and attributes with word count (\code{WC}) and column sums (\code{colsums}) if \code{tokens.only = FALSE}.
#' @examples
#' text <- c(
#'   "Why, hello there! How are you this evening?",
#'   "I am well, thank you for your inquiry!",
#'   "You are a most good at social interactions person!",
#'   "Why, thank you! You're not all bad yourself!"
#' )
#'
#' lma_dtm(text)
#'
#' # return tokens only
#' (tokens <- lma_dtm(text, tokens.only = TRUE))
#'
#' ## convert those to a regular DTM
#' lma_dtm(tokens)
#'
#' # convert a list-representation to a sparse matrix
#' lma_dtm(list(
#'   doc1 = c(why = 1, hello = 1, there = 1),
#'   doc2 = c(i = 1, am = 1, well = 1)
#' ))
#' @export

lma_dtm <- function(text, exclude = NULL, context = NULL, replace.special = FALSE, numbers = FALSE,
                    punct = FALSE, urls = TRUE, emojis = FALSE, to.lower = TRUE, word.break = " +", dc.min = 0,
                    dc.max = Inf, sparse = TRUE, tokens.only = FALSE) {
  if (!is.null(dim(text))) {
    if (is.character(text[, 1]) || is.factor(text[, 1])) {
      text <- text[, 1]
    } else {
      stop("enter a vector of texts as the first argument")
    }
  }
  if (is.list(text)) {
    if (all(c("tokens", "indices") %in% names(text))) {
      m <- do.call(rbind, lapply(seq_along(text$indices), function(i) {
        if (length(text$indices[[i]])) {
          inds <- as.factor(text$indices[[i]])
          cbind(i, as.integer(levels(inds)), tabulate(inds))
        }
      }))
      dtm <- sparseMatrix(m[, 1], m[, 2],
        x = m[, 3], dims = c(length(text$indices), length(text$tokens)),
        dimnames = list(NULL, if (is.character(text$tokens)) text$tokens else names(text$tokens))
      )
      if (!sparse) dtm <- as.matrix(dtm)
      attr(dtm, "colsums") <- text$frequencies
      attr(dtm, "type") <- "count"
      attr(dtm, "WC") <- text$WC
      attr(dtm, "opts") <- attr(text, "opts")
      attr(dtm, "time") <- attr(text, "time")
      return(dtm)
    } else {
      tokens <- unlist(unname(text), recursive = FALSE)
      cinds <- unique(names(tokens))
      if (is.null(cinds)) {
        text <- unlist(text)
      } else {
        rinds <- rep(seq_along(text), vapply(text, length, 0))
        cinds <- structure(seq_along(cinds), names = cinds)
        dtm <- sparseMatrix(
          rinds, cinds[names(tokens)],
          x = tokens,
          dims = c(length(text), length(cinds)), dimnames = list(names(text), names(cinds))
        )
        if (!sparse) dtm <- as.matrix(dtm)
        return(dtm)
      }
    }
  }
  if (is.null(text)) stop(substitute(text), " not found")
  docnames <- names(text)
  if (is.character(text) && all(nchar(text) < 500) && all(file.exists(text))) {
    text <- if (length(text) != 1 || dir.exists(text)) read.segments(text) else readLines(text)
  }
  text <- paste(" ", text, " ")
  st <- proc.time()[[3]]
  if (replace.special) text <- lma_dict("special", as.function = gsub)(text)
  text <- gsub("(?<=[^a-z0-9])'|'(?=[^a-z0-9])", '"', text, TRUE, TRUE)
  if (!urls) {
    text <- gsub(paste0(
      "\\s[a-z]+://[^\\s]*|www\\.[^\\s]*|\\s[a-z_~-]+\\.[a-z_~-]{2,}[^\\s]*|\\s[a-z_~-]+\\.",
      "(?:io|com|net|org|gov|edu)\\s"
    ), " repurl ", text, TRUE, TRUE)
    text <- gsub("(?<=[A-Z])\\.\\s", " ", text, perl = TRUE)
  }
  text <- gsub("\\s+", " ", text)
  text <- gsub("\\s(etc|st|rd|ft|feat|dr|drs|mr|ms|mrs|messrs|jr|prof)\\.", " \\1tempperiod", text, TRUE)
  text <- gsub("\\s\\.|\\.\\s", " . ", text)
  if (any(punct, emojis, !is.null(context))) {
    special <- lma_dict(special)[[1]]
    if (!missing(context) && length(context) == 1 && grepl("like", context, TRUE)) {
      context <- special[["LIKE"]]
    }
    if (punct) text <- gsub(special[["ELLIPSIS"]], " repellipsis ", text)
    if (emojis) {
      for (type in c("SMILE", "FROWN")) {
        text <- gsub(
          special[[type]], paste0(" rep", tolower(type), " "), text,
          perl = TRUE
        )
      }
    }
    if (!missing(context)) {
      if (!any(grepl("[?=]", context))) {
        context <- gsub("^\\(", "(?<=", context)
        context <- gsub("\\((?!\\?)", "(?=", context, perl = TRUE)
        context <- gsub("(?<![)*])$", "\\\\b", context, perl = TRUE)
        context <- gsub("\\*", "\\\\w*", context, perl = TRUE)
      }
      context <- structure(
        as.list(context),
        names = paste("", gsub("--+", "-", gsub("[\\s^[]|\\\\s]", "-",
          gsub("[^a-z0-9\\s\\\\']|\\\\[wbs]", "", context, TRUE, TRUE),
          perl = TRUE
        )), "")
      )
      for (rn in names(context)) text <- gsub(context[[rn]], rn, text, perl = TRUE)
    }
  }
  if (to.lower) text <- tolower(text)
  if (!is.null(exclude)) {
    if (length(exclude) == 1 && grepl(exclude, "function", TRUE)) {
      exclude <- unlist(lma_dict(), use.names = FALSE)
    } else if (is.list(exclude)) exclude <- unlist(exclude, use.names = FALSE)
  }
  if (!numbers) text <- gsub("[[:punct:]]*[0-9][0-9,.el-]*", " ", text, TRUE, TRUE)
  text <- gsub(paste0(
    "([^a-z0-9.,':/?=#\\s-]|[:/?=#](?=\\s)|(?:(?<=\\s)[:/=-]|,)(?=[a-z])|(?<=[^a-z0-9])",
    "(,(?=[a-z0-9])|[.-](?=[a-z]))|[.,'-](?=[^0-9a-z]|[.,'-]))"
  ), " \\1 ", text, TRUE, TRUE)
  text <- gsub("(\\s[a-z]+)/([a-z]+\\s)", " \\1 / \\2 ", text, TRUE, TRUE)
  text <- gsub("([a-z0-9.,'-].*[^a-z0-9])", " \\1 ", text, TRUE, TRUE)
  text <- gsub("(?<=[a-z])\\s['\u00E7\u00ED]\\s(?=[a-z])", "'", text, TRUE, TRUE)
  if (!punct) {
    text <- gsub("[^A-Za-z0-9'._-]+", " ", text)
    text <- gsub("(?=[a-z])\\.+|(?<=[^a-z0-9])['._-]+|'+\\s", " ", text, TRUE, TRUE)
  }
  text <- gsub("tempperiod", ".", text, fixed = TRUE)
  text <- gsub("^\\s+|\\s(?=\\s)|\\s+$", "", text, perl = TRUE)
  text <- strsplit(text, word.break)
  words <- sort(unique(unlist(text)))
  words <- words[!words == ""]
  if (!is.null(exclude)) {
    if (is.list(exclude)) exclude <- unlist(exclude, use.names = FALSE)
    if (!any(grepl("^", exclude, fixed = TRUE))) exclude <- gsub("\\^\\*|\\*\\$", "", paste0("^", exclude, "$"))
    if (any(ck <- grepl("[[({]", exclude) + grepl("[})]|\\]", exclude) == 1)) {
      exclude[ck] <- gsub("([([{}\\])])", "\\\\\\1", exclude[ck], perl = TRUE)
    }
    words <- grep(paste(exclude, collapse = "|"), words, value = TRUE, invert = TRUE)
  }
  if (tokens.only) {
    m <- match_terms(
      text, words, !grepl("^(?:[[:punct:]]|repellipsis)$", words),
      c(length(text), length(words)), is.null(exclude), TRUE
    )
    names(m) <- c("tokens", "frequencies", "WC", "indices")
    m$tokens <- m$tokens + 1L
    m$tokens <- sort(m$tokens)
    inds <- vector("list", length(text))
    l <- 0
    for (i in seq_along(inds)) {
      if (m$WC[i]) {
        inds[[i]] <- m$indices[seq_len(m$WC[i]) + l] + 1L
        l <- l + m$WC[i]
      } else {
        inds[[i]] <- integer()
      }
    }
    m$indices <- inds
    if (dc.min > 0 || dc.max < Inf) {
      su <- m$frequencies > dc.min & m$frequencies < dc.max
      if (any(!su)) {
        if (!any(su)) {
          warning(
            "document count bounds [", dc.min, ", ", dc.max, "] exlcuded all terms, so they were ignored",
            call. = FALSE
          )
        } else {
          m$frequencies <- m$frequencies[su]
          ex <- m$tokens[!su]
          m$tokens <- m$tokens[su]
          new_inds <- structure(seq_along(m$tokens), names = m$tokens)
          m$tokens[] <- new_inds
          for (i in seq_along(m$indices)) {
            inds <- m$indices[[i]]
            m$indices[[i]] <- unname(new_inds[as.character(inds[!inds %in% ex])])
            m$WC[i] <- length(m$indices[[i]])
          }
        }
      }
    }
    names(m$frequencies) <- names(m$tokens)
  } else {
    msu <- match_terms(
      text, words, !grepl("^(?:[[:punct:]]|repellipsis)$", words),
      c(length(text), length(words)), is.null(exclude), FALSE
    )
    m <- if (sparse) as(msu[[1]], "CsparseMatrix") else as.matrix(msu[[1]])
    if (length(docnames) == nrow(m)) rownames(m) <- docnames
    su <- msu[[3]] > dc.min & msu[[3]] < dc.max
    names(msu[[3]]) <- words
    if (any(!su)) {
      if (!any(su)) {
        warning(
          "document count bounds [", dc.min, ", ", dc.max, "] exlcuded all terms, so they were ignored",
          call. = FALSE
        )
      } else {
        m <- m[, su, drop = FALSE]
      }
    }
    attr(m, "WC") <- unlist(msu[[2]], use.names = FALSE)
    attr(m, "colsums") <- msu[[3]]
    attr(m, "type") <- "count"
    if (!missing(dc.min) || !missing(dc.max)) {
      attr(m, "info") <- paste(
        "a lim of", dc.min, "and", dc.max, "left", sum(su), "of", length(words), "unique terms"
      )
    }
  }
  attr(m, "opts") <- c(numbers = numbers, punct = punct, urls = urls, to.lower = to.lower)
  attr(m, "time") <- c(dtm = proc.time()[[3]] - st)
  m
}

#' Initialize Directories for Dictionaries and Latent Semantic Spaces
#'
#' Creates directories for dictionaries and latent semantic spaces if needed, sets them as the
#' \cr \code{lingmatch.dict.dir} and \code{lingmatch.lspace.dir} options if they are not already set,
#' and creates links to them in their expected locations (\code{'~/Dictionaries'} and
#' \code{'~/Latent Semantic Spaces'}) by default if applicable.
#' @param base Path to a directory in which to create the \code{dict} and \code{lspace} subdirectories.
#' @param dict Path to the dictionaries directory relative to \code{base}.
#' @param lspace Path to the latent semantic spaces directory relative to \code{base}.
#' @param link Logical; if \code{TRUE} (default), the full \code{dict} and/or \code{lspace} paths exist
#' (potentially after being created), and they are not \code{'~/Dictionaries'} or \code{'~/Latent Semantic Spaces'}
#' respectively, junctions (Windows) or symbolic links will be created: \code{~/Dictionaries} \code{<<===>>}
#' \code{dict} and \code{~/Latent Semantic Spaces} \code{<<===>>} \code{lspace}.
#' @return Paths to the [1] dictionaries and [2] latent semantic space directories, or a single path if only
#' \code{dict} or \code{lspace} is specified.
#' @examples
#' \dontrun{
#'
#' # set up the expected dictionary and latent semantic space directories
#' lma_initdirs("~")
#'
#' # set up directories elsewhere, and links to the expected locations
#' lma_initdirs("d:")
#'
#' # point options and create links to preexisting directories
#' lma_initdirs("~/NLP_Resources", "Dicts", "Dicts/Embeddings")
#'
#' # create just a dictionaries directory and set the
#' # lingmatch.dict.dir option without creating a link
#' lma_initdirs(dict = "z:/external_dictionaries", link = FALSE)
#' }
#' @export

lma_initdirs <- function(base = "", dict = "Dictionaries", lspace = "Latent Semantic Spaces", link = TRUE) {
  mck <- c(missing(dict), missing(lspace))
  if (base == "" && all(mck)) {
    base <- gsub('^[\'"]+|[\'"]+$', "", readline(paste0(
      "Enter the path to a directory; ~ is recommended: \n",
      "This is where ", dict, " and ", lspace, " subdirectories will be made."
    )))
    if (grepl("^(?:cancel|exit|stop|q|x|quit|no|nvm|nevermind)?$", tolower(base))) {
      stop(
        "Specify a path to a directory in which you want dictionaries",
        ' and latent semantic spaces to be stored; e.g., "~".',
        call. = FALSE
      )
    }
  }
  dirs <- normalizePath(paste0(base, if (base != "") "/", c(dict, lspace)), "/", FALSE)
  names(dirs) <- c("dict", "lspace")
  if (!all(mck)) dirs <- dirs[!mck]
  if ("dict" %in% names(dirs)) {
    if (!dir.exists(dirs[["dict"]])) dir.create(dirs[["dict"]], recursive = TRUE)
    if (getOption("lingmatch.dict.dir") == "") options(lingmatch.dict.dir = dirs[["dict"]])
  }
  if ("lspace" %in% names(dirs)) {
    if (!dir.exists(dirs[["lspace"]])) dir.create(dirs[["lspace"]], recursive = TRUE)
    if (getOption("lingmatch.lspace.dir") == "") options(lingmatch.lspace.dir = dirs[["lspace"]])
  }
  if (link) {
    linker <- if (Sys.info()[["sysname"]] == "Windows") Sys.junction else file.symlink
    if (dir.exists(dirs[["dict"]]) && !dir.exists("~/Dictionaries")) {
      linker(dirs[["dict"]], "~/Dictionaries")
      message("created dictionaries link:\n  ", dirs[["dict"]], " <<==>> ", path.expand("~/Dictionaries"))
    }
    if (dir.exists(dirs[["lspace"]]) && !dir.exists("~/Latent Semantic Spaces")) {
      linker(dirs[["lspace"]], "~/Latent Semantic Spaces")
      message("created latent space link:\n  ", dirs[["lspace"]], " <<==>> ", path.expand("~/Latent Semantic Spaces"))
    }
  }
  dirs
}

#' Standardize a Latent Semantic Space
#'
#' Reformat a .rda file which has a matrix with terms as row names, or a plain-text embeddings file
#' which has a term at the start of each line, and consistent delimiting characters. Plain-text files
#' are processed line-by-line, so large spaces can be reformatted RAM-conservatively.
#'
#' @param infile Name of the .rda or plain-text file relative to \code{dir}, \cr
#'   e.g., "default.rda" or "glove/glove.6B.300d.txt".
#' @param name Base name of the reformatted file and term file; e.g., "glove" would result in
#'   \code{glove.dat} and \code{glove_terms.txt} in \code{outdir}.
#' @param sep Delimiting character between values in each line, e.g., \code{" "} or \code{"\\t"}.
#'   Only applies to plain-text files.
#' @param digits Number of digits to round values to; default is 9.
#' @param dir Path to folder containing \code{infile}s. \cr Default is \code{getOption('lingmatch.lspace.dir')},
#'   which must be set in the current session. If this is not specified and \code{infile} is a full path,
#'   \code{dir} will be set to \code{infile}'s parent directory.
#' @param outdir Path to folder in which to save standardized files; default is \code{dir}.
#' @param remove A string with a regex pattern to be removed from term names \cr (i.e., \code{gsub(remove,}
#'   \code{"", term)}); default is \code{""}, which is ignored.
#' @param term_check A string with a regex pattern by which to filter terms; i.e., only lines with fully
#'   matched terms are written to the reformatted file. The default attempts to retain only regular words, including
#'   those with dashes, foreword slashes, and periods. Set to an empty string (\code{""}) to write all lines
#'   regardless of term.
#' @param verbose Logical: if \code{TRUE}, prints the current line number and its term to the console every 1,000 lines.
#'   Only applies to plain-text files.
#' @family Latent Semantic Space functions
#' @return Path to the standardized [1] data file and [2] terms file if applicable.
#' @examples
#' \dontrun{
#'
#' # from https://sites.google.com/site/fritzgntr/software-resources/semantic_spaces
#' standardize.lspace("EN_100k_lsa.rda", "100k_lsa")
#'
#' # from https://fasttext.cc/docs/en/english-vectors.html
#' standardize.lspace("crawl-300d-2M.vec", "facebook_crawl")
#'
#' # Standardized versions of these spaces can also be downloaded with download.lspace.
#' }
#' @export

standardize.lspace <- function(infile, name, sep = " ", digits = 9, dir = getOption("lingmatch.lspace.dir"),
                               outdir = dir, remove = "", term_check = "^[a-zA-Z]+$|^['a-zA-Z][a-zA-Z.'\\/-]*[a-zA-Z.]$", verbose = FALSE) {
  if (is.character(infile) && file.exists(infile) && missing(dir)) {
    dir <- dirname(normalizePath(infile, mustWork = FALSE))
    if (missing(outdir)) outdir <- dir
  } else if (dir == "") {
    if (outdir != "") {
      dir <- outdir
    } else {
      stop(paste(
        "specify a directory (dir), or set the lspace directory option\n(e.g.,",
        'options(lingmatch.lspace.dir = ~/Latent Semantic Spaces")) or call lma_initdir()'
      ), call. = FALSE)
    }
  }
  if (!is.character(term_check)) term_check <- ""
  if (is.character(infile)) {
    ip <- normalizePath(paste0(dir, "/", infile), "/", FALSE)
    if (!file.exists(ip)) ip <- infile
  }
  op <- normalizePath(paste0(outdir, "/", name), "/", FALSE)
  fs <- op
  if (!is.character(infile) || grepl("\\.rda$", infile)) {
    if (is.character(infile)) {
      f <- load(ip)
      o <- get(f)
    } else {
      o <- infile
    }
    o <- round(o, digits)
    ot <- rownames(o)
    if (remove != "") ot <- gsub(remove, "", ot)
    if (term_check != "") {
      su <- grepl(term_check, ot)
      o <- o[su, ]
      ot <- ot[su]
    }
    fs <- paste0(op, c(".dat", "_terms.txt"))
    writeLines(ot, fs[2])
    write(formatC(t(o), digits, 0, "f"), fs[1], ncol(o))
    if (is.character(infile)) rm(f, "o")
  } else {
    if (!file.exists(ip)) stop("infile does not exist: ", ip)
    if (!grepl(term_check, scan(ip, "", 1, sep = sep, quiet = TRUE))) {
      stop("infile does not appear to start with a term: ", ip)
    }
    fs <- paste0(op, ".dat")
    reformat_embedding(ip, op, sep, digits, remove, term_check, verbose)
  }
  message("created ", op, ".dat\nfrom ", ip)
  fs
}

#' Read and Segment Multiple Texts
#'
#' Split texts by word count or specific characters. Input texts directly, or read them in from files.
#'
#' @param path Path to a folder containing files, or a vector of paths to files. If no folders or files are
#'   recognized in \code{path}, it is treated as \code{text}.
#' @param segment Specifies how the text of each file should be segmented. If a character, split at that character;
#'   '\\n' by default. If a number, texts will be broken into that many segments, each with a roughly equal number of
#'   words.
#' @param ext The extension of the files you want to read in. '.txt' by default.
#' @param subdir Logical; if \code{TRUE}, files in folders in \code{path} will also be included.
#' @param segment.size Logical; if specified, \code{segment} will be ignored, and texts will be broken into
#'   segments containing roughly \code{segment.size} number of words.
#' @param bysentence Logical; if \code{TRUE}, and \code{segment} is a number or \code{segment.size} is specified,
#'   sentences will be kept together, rather than potentially being broken across segments.
#' @param end_in_quotes Logical; if \code{FALSE}, sentence-ending marks (\code{.?!}) will not be considered when
#'   immediately followed by a quotation mark. For example, \code{'"Word." Word.'} would be considered one sentence.
#' @param preclean Logical; if \code{TRUE}, text will be cleaned with \code{lma_dict(special)} before
#'   segmentation.
#' @param text A character vector with text to be split, used in place of \code{path}. Each entry is treated as a file.
#' @return A \code{data.frame} with columns for file names (\code{input}),
#' segment number within file (\code{segment}), word count for each segment (\code{WC}), and the text of
#' each segment (\code{text}).
#' @examples
#' # split preloaded text
#' read.segments("split this text into two segments", 2)
#'
#' \dontrun{
#'
#' # read in all files from the package directory
#' texts <- read.segments(path.package("lingmatch"), ext = "")
#' texts[, -4]
#'
#' # segment .txt files in dir in a few ways:
#' dir <- "path/to/files"
#'
#' ## into 1 line segments
#' texts_lines <- read.segments(dir)
#'
#' ## into 5 even segments each
#' texts_5segs <- read.segments(dir, 5)
#'
#' ## into 50 word segments
#' texts_50words <- read.segments(dir, segment.size = 50)
#'
#' ## into 1 sentence segments
#' texts_1sent <- read.segments(dir, segment.size = 1, bysentence = TRUE)
#' }
#' @export

read.segments <- function(path = ".", segment = NULL, ext = ".txt", subdir = FALSE, segment.size = -1,
                          bysentence = FALSE, end_in_quotes = TRUE, preclean = FALSE, text = NULL) {
  if (any(path == "")) path[path == ""] <- "."
  if (!any(file.exists(sub("[\\/]+$", "", path)))) {
    ck_text <- TRUE
    files <- path
  } else {
    ck_text <- !is.null(text)
    files <- if (ck_text) {
      text
    } else {
      dirs <- list.dirs(path, recursive = subdir)
      files <- if (any(dir.exists(path))) {
        unique(list.files(path, ext, recursive = subdir, full.names = TRUE))
      } else {
        path[file.exists(path)]
      }
      files[!files %in% dirs]
    }
  }
  if (missing(segment) && missing(segment.size)) {
    segment <- if (length(path) == 1 && (ck_text || !dir.exists(path))) "\n" else 1
  }
  if (length(files)) {
    err <- function(e) NULL
    args <- list(what = character(), quote = "", na.strings = "", quiet = TRUE)
    if (is.character(segment) && segment.size == -1) args$sep <- segment
    do.call(rbind, lapply(seq_along(files), function(fi) {
      f <- files[fi]
      args[[if (ck_text) "text" else "file"]] <- f
      WC <- NULL
      if (ck_text || file.exists(f)) {
        if (is.numeric(segment) || segment.size > 0) {
          words <- tryCatch(do.call(scan, args), error = err)
          if (!length(words)) {
            return(NULL)
          }
          if (preclean) words <- lma_dict("special", as.function = gsub)(words)
          TWC <- length(words)
          if (segment.size == -1) segment.size <- ceiling(TWC / segment)
          if (bysentence) {
            if (!is.null(segment) && is.numeric(segment)) {
              lines <- character(segment)
              WC <- numeric(segment)
            } else {
              lines <- NULL
              WC <- NULL
            }
            sentends <- grep("[.?!]$", if (end_in_quotes) {
              gsub(if (preclean) {
                '["\']+'
              } else {
                paste0("(?:", paste(
                  c(
                    '["\']',
                    unlist(lma_dict("special")$special$CHARACTERS[c('"', "'")])
                  ),
                  collapse = "|"
                ), ")+$")
              }, "", words)
            } else {
              words
            })
            if (length(sentends)) {
              sentends <- sentends[!grepl(paste0(
                "\\.[a-z]|^(?:[a-z]|[ivxm]+|\\d+|ans|govt|apt|etc|",
                "st|rd|ft|feat|dr|drs|mr|ms|mrs|messrs|jr|prof)[.?!]$"
              ), words[sentends], TRUE, TRUE)]
            }
            sentends <- c(1, sentends)
            nsents <- length(sentends)
            if (sentends[nsents] != TWC) {
              sentends <- c(sentends, TWC)
              nsents <- nsents + 1
            }
            i <- s <- p <- 1
            while (p < nsents && sum(WC) < TWC) {
              WC[i] <- 0
              while (p < nsents && WC[i] < segment.size) {
                p <- p + 1
                WC[i] <- (sentends[p] - s) + 1
              }
              lines[i] <- paste(words[seq(s, sentends[p])], collapse = " ")
              s <- sentends[p] + 1
              i <- i + 1
            }
          } else {
            segment <- ceiling(TWC / segment.size)
            lines <- character(segment)
            WC <- rep(segment.size, segment)
            WCC <- 0
            for (i in seq_len(segment)) {
              if (WCC + WC[i] > TWC) WC[i] <- TWC - WCC
              lines[i] <- paste(words[seq(WCC + 1, WCC + WC[i])], collapse = " ")
              WCC <- WCC + WC[i]
            }
          }
        } else {
          lines <- tryCatch(do.call(scan, args), error = err)
          if (!length(lines)) {
            return(NULL)
          }
        }
      } else {
        lines <- ""
      }
      data.frame(
        input = if (ck_text) fi else f, segment = seq_along(lines),
        WC = if (is.null(WC)) vapply(strsplit(lines, "\\s+"), function(sp) sum(sp != ""), 0) else WC,
        text = lines, stringsAsFactors = FALSE
      )
    }))
  } else {
    warning(
      "no files found", if (!subdir) "; might try setting subdir to TRUE to include files in folders"
    )
  }
}

#' Similarity Calculations
#'
#' Enter a numerical matrix, set of vectors, or set of matrices to calculate similarity per vector.
#'
#' @param a A vector or matrix. If a vector, \code{b} must also be provided. If a matrix and \code{b}
#'   is missing, each row will be compared. If a matrix and \code{b} is not missing, each row will
#'   be compared with \code{b} or each row of \code{b}.
#' @param b A vector or matrix to be compared with \code{a} or rows of \code{a}.
#' @param metric A character or vector of characters at least partially matching one of the
#'   available metric names (or 'all' to explicitly include all metrics),
#'   or a number or vector of numbers indicating the metric by index:
#'   \itemize{
#'     \item \strong{\code{jaccard}}: \code{sum(a & b) / sum(a | b)}
#'     \item \strong{\code{euclidean}}: \code{1 / (1 + sqrt(sum((a - b) ^ 2)))}
#'     \item \strong{\code{canberra}}: \code{mean(1 - abs(a - b) / (a + b))}
#'     \item \strong{\code{cosine}}: \code{sum(a * b) / sqrt(sum(a ^ 2 * sum(b ^ 2)))}
#'     \item \strong{\code{pearson}}: \code{(mean(a * b) - (mean(a) * mean(b))) /} \cr
#'       \code{sqrt(mean(a ^ 2) - mean(a) ^ 2) / sqrt(mean(b ^ 2) - mean(b) ^ 2)}
#'   }
#' @param group If \code{b} is missing and \code{a} has multiple rows, this will be used to make
#'   comparisons between rows of \code{a}, as modified by \code{agg} and \code{agg.mean}.
#' @param lag Amount to adjust the \code{b} index; either rows if \code{b} has multiple rows (e.g.,
#'   for \code{lag = 1}, \code{a[1, ]} is compared with \code{b[2, ]}), or values otherwise (e.g.,
#'   for \code{lag = 1}, \code{a[1]} is compared with \code{b[2]}). If \code{b} is not supplied,
#'   \code{b} is a copy of \code{a}, resulting in lagged self-comparisons or autocorrelations.
#' @param agg Logical: if \code{FALSE}, only the boundary rows between groups will be compared, see
#'   example.
#' @param agg.mean Logical: if \code{FALSE} aggregated rows are summed instead of averaged.
#' @param pairwise Logical: if \code{FALSE} and \code{a} and \code{b} are matrices with the same number of
#'   rows, only paired rows are compared. Otherwise (and if only \code{a} is supplied), all pairwise
#'   comparisons are made.
#' @param symmetrical Logical: if \code{TRUE} and pairwise comparisons between \code{a} rows were made,
#'   the results in the lower triangle are copied to the upper triangle.
#' @param mean Logical: if \code{TRUE}, a single mean for each metric is returned per row of \code{a}.
#' @param return.list Logical: if \code{TRUE}, a list-like object will always be returned, with an entry
#'   for each metric, even when only one metric is requested.
#' @details
#' Use \code{\link[RcppParallel]{setThreadOptions}} to change parallelization options; e.g., run
#' RcppParallel::setThreadOptions(4) before a call to lma_simets to set the number of CPU
#' threads to 4.
#' @return Output varies based on the dimensions of \code{a} and \code{b}:
#'   \itemize{
#'     \item \strong{Out:} A vector with a value per metric. \cr
#'       \strong{In:} Only when \code{a} and \code{b} are both vectors.
#'     \item \strong{Out:} A vector with a value per row. \cr
#'       \strong{In:} Any time a single value is expected per row: \code{a} or \code{b} is a vector,
#'       \code{a} and \code{b} are matrices with the same number of rows and \code{pairwise = FALSE}, a group is
#'       specified, or \code{mean = TRUE}, and only one metric is requested.
#'     \item \strong{Out:} A data.frame with a column per metric. \cr
#'       \strong{In:} When multiple metrics are requested in the previous case.
#'     \item \strong{Out:} A sparse matrix with a \code{metric} attribute with the metric name. \cr
#'       \strong{In:} Pairwise comparisons within an \code{a} matrix or between
#'       an \code{a} and \code{b} matrix, when only 1 metric is requested.
#'     \item \strong{Out:} A list with a sparse matrix per metric. \cr
#'       \strong{In:} When multiple metrics are requested in the previous case.
#'   }
#' @examples
#' text <- c(
#'   "words of speaker A", "more words from speaker A",
#'   "words from speaker B", "more words from speaker B"
#' )
#' (dtm <- lma_dtm(text))
#'
#' # compare each entry
#' lma_simets(dtm)
#'
#' # compare each entry with the mean of all entries
#' lma_simets(dtm, colMeans(dtm))
#'
#' # compare by group (corresponding to speakers and turns in this case)
#' speaker <- c("A", "A", "B", "B")
#'
#' ## by default, consecutive rows from the same group are averaged:
#' lma_simets(dtm, group = speaker)
#'
#' ## with agg = FALSE, only the rows at the boundary between
#' ## groups (rows 2 and 3 in this case) are used:
#' lma_simets(dtm, group = speaker, agg = FALSE)
#' @export

lma_simets <- function(a, b = NULL, metric = NULL, group = NULL, lag = 0, agg = TRUE, agg.mean = TRUE,
                       pairwise = TRUE, symmetrical = FALSE, mean = FALSE, return.list = FALSE) {
  mets <- c("jaccard", "euclidean", "canberra", "cosine", "pearson")
  if (missing(metric) && length(b) == 1 && !grepl(" ", b) &&
    any(grepl(tolower(substr(b, 1, 3)), mets, fixed = TRUE))) {
    metric <- b
    b <- NULL
  }
  met <- match_metric(metric)
  if (!length(met$selected)) {
    stop(
      "no recognized metric; should match one of ",
      paste0(mets, collapse = ", "), ", or all"
    )
  }
  st <- proc.time()[[3]]
  slots <- c("i", "p", "x", "Dim")
  if ((is.character(a) || is.factor(a)) && any(grepl("[a-zA-Z]", a))) {
    a <- lma_dtm(a)
  } else if (is.data.frame(a)) a <- Matrix(as.matrix(a), sparse = TRUE)
  if (is.null(b) && !missing(lag) && is.null(dim(a))) b <- a
  if (is.null(b)) {
    n <- dim(a)[1]
    if (is.null(n) || n < 2) stop("a must have more than 1 row when b is not provided", call. = FALSE)
    if (is.null(group)) {
      if (!all(slots %in% slotNames(a))) a <- as(a, "CsparseMatrix")
      res <- calculate_similarities(a, NULL, 2, met$dummy)
      for (i in seq_along(res)) attr(res[[i]], "metric") <- met$selected[i]
    } else {
      if (length(group) != n) stop("group is not the same length as a or columns in a")
      ager <- if (agg.mean) colMeans else colSums
      l <- length(group)
      chunks <- NULL
      i <- 1
      while (i < l) {
        st <- i
        g <- group[i]
        while (i < l && g == group[i + 1]) i <- i + 1
        chunks <- c(chunks, list(seq(st, i)))
        i <- i + 1
      }
      if (!any(chunks[[length(chunks)]] == l)) chunks <- c(chunks, list(l))
      rows <- character(length(chunks) - 1)
      res <- as.data.frame(matrix(0, length(chunks) - 1, sum(met$dummy), dimnames = list(NULL, met$selected)))
      for (i in seq_len(length(chunks) - 1)) {
        s <- chunks[[i]]
        sa <- if (agg) s else s[length(s)]
        ta <- ager(a[sa, , drop = FALSE])
        s <- chunks[[i + 1]]
        sb <- if (agg) s else s[1]
        tb <- ager(a[sb, , drop = FALSE])
        res[i, ] <- vector_similarity(ta, tb, met$dummy)
        rows[i] <- paste(paste(sa, collapse = ", "), "<->", paste(sb, collapse = ", "))
      }
      rownames(res) <- rows
    }
  } else {
    if ((is.character(b) || is.factor(b)) && any(grepl("[a-zA-Z]", b))) {
      b <- lma_dtm(b)
    } else if (is.data.frame(b)) b <- Matrix(as.matrix(b), sparse = TRUE)
    bn <- if (is.null(dim(b))) length(b) else dim(b)[1]
    if (lag && abs(lag) >= bn) lag <- if (lag < 0) -bn + 1 else bn - 1
    res <- if (is.null(dim(b)) && length(a) == bn && (is.null(dim(a)) || any(dim(a) == 1))) {
      b <- as.numeric(b)
      if (lag) b <- if (lag < 0) c(b[-seq_len(-lag)], numeric(-lag)) else c(numeric(lag), b)[seq_len(bn)]
      vector_similarity(as.numeric(a), b, met$dummy)
    } else {
      if (is.null(dim(a))) a <- Matrix(a, 1, dimnames = list(NULL, names(a)), sparse = TRUE)
      if (!all(slots %in% slotNames(a))) a <- as(a, "CsparseMatrix")
      if (is.null(dim(b))) b <- Matrix(b, 1, dimnames = list(NULL, names(b)), sparse = TRUE)
      if (!all(slots %in% slotNames(b))) b <- as(b, "CsparseMatrix")
      d <- c(dim(a), dim(b))
      if (d[2] != d[4]) {
        ns <- colnames(a)
        if (!is.null(ns)) {
          ns <- ns[ns %in% colnames(b)]
          if (length(ns)) {
            a <- a[, ns, drop = FALSE]
            b <- b[, ns, drop = FALSE]
          }
        }
        d <- c(dim(a), dim(b))
        if (d[2] != d[4]) {
          stop("a and b have a different number of columns, which could not be aligned by name")
        }
      }
      if (lag) {
        b <- if (lag > 0) {
          rbind(Matrix(0, lag, d[4], sparse = TRUE), b[-(seq_len(lag) + d[3] - lag), ])
        } else {
          rbind(b[-seq_len(-lag), ], Matrix(0, -lag, d[4], sparse = TRUE))
        }
      }
      type <- if (((missing(pairwise) || !pairwise) && d[1] == d[3]) ||
        d[3] == 1) {
        1
      } else {
        3
      }
      calculate_similarities(a, b, type, met$dummy)
    }
  }
  if ("list" %in% class(res) && length(res)) {
    pairwise <- "dtCMatrix" %in% class(res[[1]])
    if ((pairwise && symmetrical) || mean) {
      for (i in seq_along(res)) {
        if (pairwise && (symmetrical || mean)) res[[i]] <- forceSymmetric(res[[i]], "L")
        if (mean) {
          res[[i]] <- if (is.null(dim(res[[i]]))) {
            mean(res[[i]], na.rm = TRUE)
          } else {
            (rowSums(res[[i]], TRUE) - 1) / (ncol(res[[i]]) - 1)
          }
        }
      }
    }
    if (is.null(dim(res[[1]]))) {
      rn <- if (!is.na(nd <- which(c(dim(a), dim(b)) == length(res[[1]]))[1]) && !is.null(rownames(if (nd == 1) a else b))) {
        rownames(if (nd == 1) a else b)
      } else {
        NULL
      }
      if (length(met$selected) == 1) {
        if (length(rn) == length(res[[1]])) names(res[[1]]) <- rn
      } else {
        attr(res, "row.names") <- if (length(rn) == length(res[[1]])) rn else seq_along(res[[1]])
        attr(res, "class") <- "data.frame"
      }
    }
    if (!return.list && length(met$selected) == 1) res <- res[[1]]
  }
  attr(res, "time") <- c(simets = proc.time()[[3]] - st)
  res
}

#' Select Latent Semantic Spaces
#'
#' Retrieve information and links to latent semantic spaces
#' (sets of word vectors/embeddings) available at \href{https://osf.io/489he}{osf.io/489he},
#' and optionally download their term mappings (\href{https://osf.io/xr7jv}{osf.io/xr7jv}).
#'
#' @param query A character used to select spaces, based on names or other features.
#'   If length is over 1, \code{get.map} is set to \code{TRUE}. Use \code{terms} alone to select
#'   spaces based on term coverage.
#' @param dir Path to a directory containing \code{lma_term_map.rda} and downloaded spaces; \cr will look in
#'   \code{getOption('lingmatch.lspace.dir')} and \code{'~/Latent Semantic Spaces'} by default.
#' @param terms A character vector of terms to search for in the downloaded term map, to calculate
#'   coverage of spaces, or select by coverage if \code{query} is not specified.
#' @param get.map Logical; if \code{TRUE} and \code{lma_term_map.rda} is not found in
#'   \code{dir}, the term map (\href{https://osf.io/xr7jv}{lma_term_map.rda}) is
#'   downloaded and decompressed.
#' @param check.md5 Logical; if \code{TRUE} (default), retrieves the MD5 checksum from OSF,
#'   and compares it with that calculated from the downloaded file to check its integrity.
#' @param mode Passed to \code{\link{download.file}} when downloading the term map.
#' @return A list with varying entries:
#'   \itemize{
#'     \item \strong{\code{info}}: The version of \href{https://osf.io/9yzca}{osf.io/9yzca} stored internally; a
#'       \code{data.frame}  with spaces as row names, and information about each space in columns:
#'         \itemize{
#'           \item \strong{\code{terms}}: number of terms in the space
#'           \item \strong{\code{corpus}}: corpus(es) on which the space was trained
#'           \item \strong{\code{model}}: model from which the space was trained
#'           \item \strong{\code{dimensions}}: number of dimensions in the model (columns of the space)
#'           \item \strong{\code{model_info}}: some parameter details about the model
#'           \item \strong{\code{original_max}}: maximum value used to normalize the space; the original
#'             space would be \code{(vectors *} \code{original_max) /} \code{100}
#'           \item \strong{\code{osf_dat}}: OSF id for the \code{.dat} files; the URL would be
#'             https://osf.io/\code{osf_dat}
#'           \item \strong{\code{osf_terms}}: OSF id for the \code{_terms.txt} files; the URL would be
#'             https://osf.io/\code{osf_terms}
#'           \item \strong{\code{wiki}}: link to the wiki for the space
#'           \item \strong{\code{downloaded}}: path to the \code{.dat} file if downloaded,
#'             and \code{''} otherwise.
#'         }
#'     \item \strong{\code{selected}}: A subset of \code{info} selected by \code{query}.
#'     \item \strong{\code{term_map}}: If \code{get.map} is \code{TRUE} or \code{lma_term_map.rda} is found in
#'       \code{dir}, a copy of \href{https://osf.io/xr7jv}{osf.io/xr7jv}, which has space names as
#'       column names, terms as row names, and indices as values, with 0 indicating the term is not
#'       present in the associated space.
#'   }
#' @family Latent Semantic Space functions
#' @examples
#' # just retrieve information about available spaces
#' spaces <- select.lspace()
#' spaces$info[1:10, c("terms", "dimensions", "original_max")]
#'
#' # retrieve all spaces that used word2vec
#' w2v_spaces <- select.lspace("word2vec")$selected
#' w2v_spaces[, c("terms", "dimensions", "original_max")]
#'
#' \dontrun{
#'
#' # select spaces by terms
#' select.lspace(terms = c(
#'   "part-time", "i/o", "'cause", "brexit", "debuffs"
#' ))$selected[, c("terms", "coverage")]
#' }
#' @export

select.lspace <- function(query = NULL, dir = getOption("lingmatch.lspace.dir"), terms = NULL,
                          get.map = FALSE, check.md5 = TRUE, mode = "wb") {
  if (ckd <- dir == "") dir <- "~/Latent Semantic Spaces"
  if (!missing(query) && !is.character(query) && !is.null(colnames(query))) {
    terms <- colnames(query)
    query <- NULL
  }
  map_path <- normalizePath(paste0(dir, "/lma_term_map.rda"), "/", FALSE)
  if (missing(get.map) && (file.exists(map_path) || length(terms) > 1)) get.map <- TRUE
  if (!exists("lma_term_map")) lma_term_map <- NULL
  if (get.map && ckd && !dir.exists(dir)) stop("specify `dir` or use `lma_initdirs()` to download the term map")
  if (get.map && !(file.exists(map_path) || !is.null(lma_term_map))) {
    fi <- tryCatch(
      strsplit(readLines("https://api.osf.io/v2/files/xr7jv", 1, TRUE, FALSE, "utf-8"), '[:,{}"]+')[[1]],
      error = function(e) NULL
    )
    if (!file.exists(map_path) || (!is.null(fi) && md5sum(map_path) != fi[which(fi == "md5") + 1])) {
      dir.create(dir, FALSE, TRUE)
      status <- tryCatch(download.file(
        "https://osf.io/download/xr7jv", map_path,
        mode = mode
      ), error = function(e) 1)
      if (!status && check.md5 && !is.null(fi)) {
        ck <- md5sum(map_path)
        if (fi[which(fi == "md5") + 1] == ck) {
          load(map_path)
          save(lma_term_map, file = map_path, compress = FALSE)
        } else {
          warning(paste0(
            "The term map's MD5 (", ck, ") does not seem to match the one on record;\n",
            "double check and try manually downloading at https://osf.io/xr7jv/?show=revision"
          ))
        }
      }
    }
  } else if (!file.exists(map_path) && !is.null(terms)) {
    stop("The term map could not be found; specify dir or run lma_initdirs('~') to download it", call. = FALSE)
  }
  r <- list(info = lss_info, selected = lss_info[NULL, ])
  r$info[, "wiki"] <- paste0("https://osf.io/489he/wiki/", rownames(lss_info))
  r$info[, "downloaded"] <- normalizePath(paste0(dir, "/", rownames(r$info), ".dat"), "/", FALSE)
  r$info[!file.exists(r$info[, "downloaded"]), "downloaded"] <- ""
  if (get.map) {
    if (!is.null(lma_term_map)) {
      r$term_map <- lma_term_map
    } else if (file.exists(map_path) && is.null(lma_term_map)) {
      load(map_path)
      r$term_map <- lma_term_map
      rm(list = "lma_term_map")
    }
  }
  if (!is.null(terms)) {
    if (length(terms) > 1 && "term_map" %in% names(r)) {
      terms <- tolower(terms)
      overlap <- terms[terms %in% rownames(r$term_map)]
      if (length(overlap)) {
        r$info$coverage <- colSums(r$term_map[overlap, , drop = FALSE] != 0) / length(terms)
        r$selected <- r$info[order(r$info$coverage, decreasing = TRUE)[1:5], ]
        r$space_terms <- overlap
      } else {
        warning("no terms were found")
      }
    }
  }
  if (!is.null(query)) {
    query <- paste0(query, collapse = "|")
    if (!length(sel <- grep(query, rownames(lss_info), TRUE))) {
      collapsed <- vapply(
        seq_len(nrow(lss_info)),
        function(r) paste(c(rownames(lss_info)[r], lss_info[r, ]), collapse = " "), ""
      )
      if (!length(sel <- grep(query, collapsed, TRUE))) {
        sel <- grep(paste(strsplit(query, "[[:space:],|]+")[[1]], collapse = "|"), collapsed, TRUE)
      }
    }
    if (length(sel)) r$selected <- r$info[sel, ]
  }
  r
}

#' Download Latent Semantic Spaces
#'
#' Downloads the specified semantic space from \href{https://osf.io/489he}{osf.io/489he}.
#'
#' @param space Name of one or more spaces you want to download, or \code{'all'} for all available.
#'  \code{'100k_lsa'} is the default, and some other common options might be \code{'google'}, \code{'facebook'},
#'  or \code{'glove'}. See \href{https://osf.io/489he/wiki/home}{osf.io/489he/wiki} for more information,
#'  and a full list of spaces.
#' @param decompress Logical; if \code{TRUE} (default), decompresses the downloaded file
#'  with the \code{bunzip2} system command assuming it is available \cr (as indicated by
#'  \code{Sys.which('bunzip2')}).
#' @param check.md5 Logical; if \code{TRUE} (default), retrieves the MD5 checksum from OSF,
#'  and compares it with that calculated from the downloaded file to check its integrity.
#' @param mode A character specifying the file write mode; default is 'wb'. See
#'  \code{\link{download.file}}.
#' @param dir Directory in which to save the space. Specify this here, or set the lspace directory option
#'  (e.g., \code{options(lingmatch.lspace.dir = '~/Latent Semantic Spaces')}), or use
#'  \code{\link{lma_initdirs}} to initialize a directory.
#' @param overwrite Logical; if \code{TRUE}, will replace existing files.
#' @family Latent Semantic Space functions
#' @return A character vector with paths to the [1] data and [2] term files.
#' @examples
#' \dontrun{
#'
#' download.lspace("glove_crawl", dir = "~/Latent Semantic Spaces")
#' }
#' @export
#' @importFrom utils download.file
#' @importFrom tools md5sum

download.lspace <- function(space = "100k_lsa", decompress = TRUE,
                            check.md5 = TRUE, mode = "wb", dir = getOption("lingmatch.lspace.dir"),
                            overwrite = FALSE) {
  download.resource(
    "lspace", space,
    decompress = decompress, check.md5 = check.md5, mode = mode, dir = dir, overwrite = overwrite
  )
}

#include <Rcpp.h>
using namespace std;
using namespace Rcpp;

/**
 * @brief Converts tokenized texts to indices, as part of document-term matrix
 * creation.
 *
 * @param tokens A list of character vectors; tokenized texts.
 * @param terms A character vector containing all terms.
 * @param isword A logical vector indicating whether each term is a word or.
 * punctuation mark (and so should count toward word-count).
 * @param dim Dimensions of the output matrix [n rows, n cols].
 * @param complete Logical indicating whether to all terms tokens should be
 * counted, or only those in `terms` should count.
 * @param tokensonly Logical specifying the return type.
 * @return List; a term map with row indices (if `tokensonly`), or a sparse
 * matrix.
 */

// [[Rcpp::export]]
List match_terms(const List &tokens, const CharacterVector &terms,
                 const LogicalVector &isword, const IntegerVector &dim,
                 const bool &complete, const bool &tokensonly) {
  const int n = tokens.length();
  int i = terms.length(), colindex, s = 0, un, p, ck = 1e3;
  vector<int> rows, columns, rowsums(n), colsums(i);
  vector<double> counts;
  CharacterVector uses;
  unordered_map<String, int> dict;
  for (; i--;) dict.insert({terms[i], i});
  if (tokensonly) {
    for (; s < n; s++) {
      uses = tokens[s];
      for (un = uses.length(), p = 0; p < un; p++)
        if (complete || dict.find(uses[p]) != dict.end()) {
          colindex = dict.at(uses[p]);
          rows.push_back(colindex);
          colsums[colindex]++;
          rowsums[s]++;
        }
      if (!--ck) {
        checkUserInterrupt();
        ck = 1e3;
      }
    }
    return List::create(dict, colsums, rowsums, rows);
  } else {
    for (; s < n; s++) {
      uses = tokens[s];
      for (un = uses.length(), p = 0; p < un; p++)
        if (complete || dict.find(uses[p]) != dict.end()) {
          colindex = dict.at(uses[p]);
          colsums[colindex]++;
          if (isword[colindex]) rowsums[s]++;
          if (p == 0 || uses[p] != uses[p - 1]) {
            rows.push_back(s);
            columns.push_back(colindex);
            counts.push_back(1);
            i++;
          } else
            counts[i]++;
        }
      if (!--ck) {
        checkUserInterrupt();
        ck = 1e3;
      }
    }
    S4 dtm("dgTMatrix");
    dtm.slot("Dim") = dim;
    dtm.slot("Dimnames") = List::create(R_NilValue, terms);
    dtm.slot("i") = rows;
    dtm.slot("j") = columns;
    dtm.slot("x") = counts;
    return List::create(dtm, rowsums, colsums);
  }
}

// [[Rcpp::depends(BH)]]
#include <Rcpp.h>

#include <fstream>
#define BOOST_REGEX_USE_C_LOCALE
#include <boost/regex.hpp>
using namespace std;
using namespace Rcpp;

/**
 * @brief Standardizes embeddings.
 *
 * @param infile Path to original embeddings file.
 * @param outfile Base path to reformatted files, which will translate to a
 * `.dat` and `_terms.txt` file.
 * @param sep Separating character(s).
 * @param digits Number of digits to round to.
 * @param remove String to remove to any terms.
 * @param term_check Regex pattern used to check terms for inclusion.
 * @param verbose Logical indicating whither to print every thousandth line
 * during reformatting.
 * @return void; writes output files.
 */

// [[Rcpp::export]]
void reformat_embedding(const std::string &infile, const std::string &outfile,
                        const char &sep = ' ', const int &digits = 9,
                        const std::string &remove = "",
                        const std::string &term_check =
                            "^[a-zA-Z]+$|^['a-zA-Z][a-zA-Z.'\\/-]*[a-zA-Z.]$",
                        const bool &verbose = false) {
  ifstream d(infile);
  ofstream o(outfile + ".dat"), cn(outfile + "_terms.txt");
  o << setprecision(digits) << fixed;
  int n, i, ln, cl = 0, ck = 1e3;
  const string num = "-.0123456789";
  bool start = true, filter = term_check != "";
  boost::regex ckterm(term_check, boost::regex_constants::optimize),
      rm(remove, boost::regex_constants::optimize);
  std::string line, term, value;
  for (; getline(d, line);) {
    for (cl++, term = "", n = line.length(), i = 0; i < n; i++) {
      if (line[i] == sep) break;
      term.push_back(line[i]);
    }
    if (remove != "") term = boost::regex_replace(term, rm, "");
    if (term != "" && n > 100 && i++ < n &&
        (!filter || boost::regex_match(term, ckterm)) &&
        num.find(line[i]) != string::npos)
      try {
        ln = line.length();
        if (ln) {
          cn << term << endl;
          for (start = true, value = ""; i < ln; i++) {
            if (line[i] == sep) {
              if (value != "") {
                if (start) {
                  o << atof(value.c_str());
                  start = false;
                } else
                  o << ' ' << atof(value.c_str());
                value = "";
              }
            } else
              value.push_back(line[i]);
          }
          value == "" ? o << endl : o << ' ' << atof(value.c_str()) << endl;
        }
      } catch (const std::exception &e) {
        Rcout << line << endl;
      }
    if (!--ck) {
      checkUserInterrupt();
      if (verbose) Rcout << "line " << cl << ": " << term << endl;
      ck = 1e3;
    }
  }
}

/**
 * @brief Read embeddings vectors based on term indices.
 *
 * @param indices Integer vector containing the indices of the vectors to load.
 * @param file File containing vectors.
 * @param sep Separator between values.
 * @return NumericVector; term by dimension matrix.
 */

// [[Rcpp::export]]
NumericVector extract_indices(const IntegerVector &indices,
                              const std::string &file, const char &sep = ' ') {
  int nr = indices.length(), ck = 1e3, nc = 0, c = 0, t = 0, p = 0, ln, i;
  std::string line, value;
  ifstream d(file);
  for (getline(d, line), t = p = line.length(), value = ""; p--;) {
    if (line[p] == sep) {
      if (p != t - 1) nc++;
    } else if (!nc)
      value.insert(value.begin(), line[p]);
  }
  nc == 1 ? nc = atoi(value.c_str()) : nc++;
  IntegerVector dims = {nc, nr};
  NumericVector r(nc * nr);
  if (indices[0] == 1) {
    d.seekg(0, d.beg);
    p = 0;
  } else
    p = 1;
  for (t = 0; t < nr;) {
    if (++p == indices[t]) {
      getline(d, line);
      for (value = "", ln = line.length(), c = 0, i = 0; i < ln; i++) {
        if (line[i] == sep) {
          r[t * nc + c++] = atof(value.c_str());
          value = "";
        } else
          value.push_back(line[i]);
      }
      r[t++ * nc + c] = atof(value.c_str());
    } else
      d.ignore(numeric_limits<streamsize>::max(), '\n');
    if (!--ck) {
      checkUserInterrupt();
      ck = 1e3;
    }
  }
  d.close();
  r.attr("dim") = dims;
  return r;
}

// [[Rcpp::depends(RcppParallel)]]
#include <Rcpp.h>
#include <RcppParallel.h>
using namespace std;
using namespace Rcpp;
using namespace RcppParallel;

/**
 * @brief Calculate similarity between two vectors.
 *
 * @param a First numeric vector.
 * @param b Second numeric vector.
 * @param metrics Binary integer vector indicating which metrics to calculate
 * ([Jaccard, Euclidean, Canberra, Cosine, Pearson's]).
 * @return NumericVector; each requested similarity value.
 */

// [[Rcpp::export]]
NumericVector vector_similarity(NumericVector &a, NumericVector &b,
                                const IntegerVector &metrics) {
  const int n = a.length();
  bool ck = true;
  NumericVector op;
  LogicalVector su = is_na(a) | is_na(b) | ((a == 0) & (b == 0));
  if (is_true(all(su)))
    ck = false;
  else if (is_true(any(su))) {
    a = a[!su];
    b = b[!su];
  }
  if (metrics[0]) {
    op.push_back(ck ? (float)sum((a != 0) & (b != 0)) / sum((a != 0) | (b != 0))
                    : NA_REAL,
                 "jaccard");
  }
  if (metrics[1])
    op.push_back(ck ? 1 / (1 + sqrt(sum(pow(a - b, 2)))) : 1, "euclidean");
  if (metrics[2])
    op.push_back(ck ? 1 - sum(abs(a - b) / (abs(a) + abs(b))) / n : NA_REAL,
                 "canberra");
  if (metrics[3]) {
    op.push_back(
        ck ? sum(a * b) / sqrt(sum(pow(a, 2) * sum(pow(b, 2)))) : NA_REAL,
        "cosine");
    if (isnan((float)op["cosine"])) op["cosine"] = NA_REAL;
  }
  if (metrics[4]) {
    const double ma = sum(a) / n, mb = sum(b) / n;
    op.push_back(ck ? (sum(a * b) / n - (ma * mb)) /
                          sqrt(sum(pow(a, 2)) / n - pow(ma, 2)) /
                          sqrt(sum(pow(b, 2)) / n - pow(mb, 2))
                    : NA_REAL,
                 "pearson");
    if (isnan((float)op["pearson"])) op["pearson"] = NA_REAL;
  }
  return op;
}

class Sparse_Arrays {
 public:
  const NumericVector values;
  const IntegerVector dims, i, p;
  const int n = 0, ncol = 0;
  IntegerVector row_starts, row_maps, columns;
  Sparse_Arrays(const S4 &m)
      : values(m.slot("x")),
        dims(m.slot("Dim")),
        i(m.slot("i")),
        p(m.slot("p")),
        n(values.length()),
        ncol(dims[1]),
        row_starts(IntegerVector(dims[0], -1)),
        row_maps(IntegerVector(n, -1)),
        columns(IntegerVector(n)) {
    int index = 0, r = 0, c = p[0];
    unordered_map<int, int> row_steps;
    for (; index < n; index++) {
      r = i[index];
      if (row_starts[r] == -1) {
        row_starts[r] = index;
        row_steps.insert({r, index});
      } else {
        row_maps[row_steps.at(r)] = index;
        row_steps.at(r) = index;
      }
      if (c + 1 < ncol && index == p[c + 1]) {
        c++;
        while (c + 1 < ncol && p[c] == p[c + 1]) c++;
      }
      columns[index] = c;
    }
  }
};

struct Compare : public Worker {
  const Sparse_Arrays a, b;
  const int n, ncol, type, nmetrics = 5;
  RVector<int> aind, bind, metrics;
  RVector<double> jaccard, euclidean, canberra, cosine, pearson;
  Compare(const Sparse_Arrays &a, const Sparse_Arrays &b,
          const IntegerVector &i, const IntegerVector &j, const int &type,
          const IntegerVector &metrics,
          unordered_map<String, NumericVector> &out)
      : a(a),
        b(b),
        n(a.n),
        ncol(a.ncol),
        type(type),
        aind(i),
        bind(j),
        metrics(metrics),
        jaccard(out.at("jaccard")),
        euclidean(out.at("euclidean")),
        canberra(out.at("canberra")),
        cosine(out.at("cosine")),
        pearson(out.at("pearson")) {}
  void operator()(size_t p, size_t final) {
    int r, comp, i, c, bi, bc, l, col;
    double x, bx, dif, sa, sb, sdif, sadif, sse, sne, cp, asq, bsq, sj, si, ma,
        mb;
    for (; p < final; p++) {
      r = aind[p];
      comp = bind[p];
      i = a.row_starts[r];
      c = i == -1 ? -1 : a.columns[i];
      bi = b.row_starts[comp];
      bc = bi == -1 ? -1 : b.columns[bi];
      x = bx = sa = sb = sdif = sadif = sse = sne = cp = asq = bsq = sj = si =
          0;
      for (l = ncol; l--;) {
        col = c;
        if (i == -1 || (bc != -1 && c > bc)) {
          x = 0;
          if (i == -1) c = -1;
        } else {
          x = a.values[i];
          i = a.row_maps[i];
          c = i == -1 ? -1 : a.columns[i];
          if (NumericVector::is_na(x)) continue;
        }
        if (bi == -1 || (col != -1 && bc > col)) {
          bx = 0;
          if (bi == -1) bc = -1;
        } else {
          bx = b.values[bi];
          bi = b.row_maps[bi];
          bc = bi == -1 ? -1 : b.columns[bi];
          if (NumericVector::is_na(bx) || bx + x == 0) continue;
        }
        sa += x;
        sb += bx;
        dif = x - bx;
        sse += pow(dif, 2);
        dif = abs(dif);
        sdif += dif;
        if (x || bx) sne += dif / (abs(x) + abs(bx));
        cp += x * bx;
        asq += pow(x, 2);
        bsq += pow(bx, 2);
        sj += x && bx;
        si += x || bx;
        if (i == -1 && bi == -1) break;
      }
      ma = sa / ncol;
      mb = sb / ncol;
      if (si) {
        if (metrics[0]) jaccard[p] = sj / si;
        if (metrics[2]) canberra[p] = 1 - sne / ncol;
      }
      if (metrics[1]) euclidean[p] = 1 / (1 + sqrt(sse));
      if (sa && sb) {
        if (metrics[3] && asq && bsq) cosine[p] = cp / sqrt(asq) / sqrt(bsq);
        if (metrics[4]) {
          x = (cp / ncol - ma * mb) / sqrt(asq / ncol - pow(ma, 2)) /
              sqrt(bsq / ncol - pow(mb, 2));
          pearson[p] = !isnan(x) ? x : NA_REAL;
        }
      }
    }
  }
};

/**
 * @brief Calculate similarities between vectors within two matrices.
 *
 * @param m Primary sparse matrix.
 * @param comp The second sparse matrix, or NULL if pairwise comparisons within
 * `m` are to be made.
 * @param type Integer indicating which type of comparison to make: 1 = `m` row
 * to specific `comp` row, 2 = all `m` rows to `m` rows, 3 = all `m` rows to all
 * `comp` rows.
 * @param metrics Binary integer vector indicating which metrics to calculate.
 * @return List; a sparse matrix containing similarities for each requested
 * metric.
 */

// [[Rcpp::export]]
List calculate_similarities(const S4 &m, const RObject &comp, int &type,
                            const IntegerVector &metrics) {
  const bool procb = type != 2 && comp.isS4();
  Sparse_Arrays a(m), b = procb ? Sparse_Arrays(as<S4>(comp)) : a;
  const IntegerVector dim{a.dims[0], b.dims[0]};
  size_t nrow = b.dims[0], ai = 1, bi = 0, index = metrics.length();
  const size_t arow = a.dims[0], n = type == 1   ? a.dims[0]
                                     : type == 2 ? arow * (arow - 1) / 2
                                                 : arow * nrow;
  // setting up output vectors
  const CharacterVector metric_names{"jaccard", "euclidean", "canberra",
                                     "cosine", "pearson"};
  unordered_map<String, NumericVector> res;
  for (; index--;)
    res.insert({metric_names[index], NumericVector(metrics[index] ? n : 0)});
  // mapping out row-row comparisons
  IntegerVector aind(n), bind(n), column_starts(nrow + 1);
  column_starts[nrow] = n;
  switch (type) {
    case 1:  // each a row to a single b row or paired b rows
      aind = seq_len(n) - 1;
      if (nrow == n) bind = aind;
      break;
    case 2:  // pairwise between a rows
      for (index = 0; index < n; index++) {
        aind[index] = ai;
        bind[index] = bi;
        if (++ai == nrow) {
          column_starts[++bi] = index + 1;
          ai = bi + 1;
        }
      }
      break;
    case 3:  // each a row with each b row
      ai = 0, nrow = a.dims[0], index = 0;
      for (; index < n; index++) {
        aind[index] = ai;
        bind[index] = bi;
        if (++ai == nrow) {
          column_starts[++bi] = index + 1;
          ai = 0;
        }
      }
      break;
  }
  // making comparisons
  Compare sims(a, b, aind, bind, type, metrics, res);
  checkUserInterrupt();
  parallelFor(0, n, sims);
  // formatting output
  if (a.dims[0] == 1) type = 1;
  List op;
  NumericVector sim;
  for (const String met : metric_names) {
    sim = res.at(met);
    if (sim.length()) {
      if (type == 1) {
        op.push_back(sim, met);
      } else {
        S4 simsm(type == 2 ? "dtCMatrix" : "dgCMatrix");
        simsm.slot("Dim") = dim;
        List dimnames(m.slot("Dimnames")),
            newdimnames{dimnames[0],
                        dim[1] == dim[0] ? dimnames[0] : R_NilValue};
        if (procb) {
          List dimnames2(comp.slot("Dimnames"));
          newdimnames[1] = dimnames2[0];
        }
        simsm.slot("Dimnames") = newdimnames;
        simsm.slot("i") = aind;
        simsm.slot("p") = column_starts;
        simsm.slot("x") = sim;
        if (type == 2) {
          simsm.slot("uplo") = "L";
          simsm.slot("diag") = "U";
        }
        op.push_back(simsm, met);
      }
    }
  }
  return op;
}

// [[Rcpp::depends(BH)]]
#include <Rcpp.h>

#include <fstream>
#define BOOST_REGEX_USE_C_LOCALE
#include <boost/regex.hpp>
using namespace std;
using namespace Rcpp;

/**
 * @brief Count pattern matches within text.
 *
 * @param texts Text to search within.
 * @param patterns Patterns to search for.
 * @param terms Integer vector containing term indices.
 * @param fixed Logical indicating whether patterns are regular expressions or
 * fixed.
 * @param exclusive Logical indicating whether matches should be removed on
 * match (avoiding sub-matches).
 * @return List; a sparse document-term matrix.
 */

// [[Rcpp::export]]
List pattern_search(const CharacterVector &texts,
                    const CharacterVector &patterns, const IntegerVector &terms,
                    const bool &fixed, const bool &exclusive) {
  int cx, r, l, i = 0, n = patterns.length(), tn = texts.length();
  unsigned int p, tp;
  std::string txt;
  IntegerVector dim{tn, (int)patterns.length()}, rows, columns, rowsums(dim[0]);
  NumericVector values;
  if (fixed) {
    for (r = 0; r < tn; r++) {
      checkUserInterrupt();
      txt = texts[r];
      for (i = 0, n = patterns.length(); i < n; i++) {
        l = patterns[i].size(), cx = 0;
        if (l)
          for (std::string::size_type r, p = 0;
               (r = txt.find(patterns[i], p)) != std::string::npos;) {
            if (exclusive) {
              txt.replace(r, l, " ");
              p = r + 1;
            } else
              p = r + l;
            cx += 1;
          }
        if (cx) {
          rows.push_back(r);
          columns.push_back(terms[i]);
          values.push_back(cx);
          rowsums[r] += cx;
        }
      }
    }
  } else {
    vector<boost::regex> pats;
    if (!fixed)
      for (; i < n; i++)
        pats.push_back(
            boost::regex(patterns[i], boost::regex_constants::optimize));
    const unsigned int zero = 0;
    for (r = zero; r < tn; r++) {
      checkUserInterrupt();
      txt = texts[r];
      boost::smatch re;
      for (i = zero, n = pats.size(); i < n; i++) {
        std::string::const_iterator start = txt.cbegin();
        cx = zero;
        if (patterns[i].size())
          for (tp = zero;
               boost::regex_search(start, txt.cend(), re, pats[i]);) {
            p = re.position(zero);
            if (exclusive) {
              txt.replace(tp + p, re.length(), " ");
              start += p + 1;
              tp += p + 1;
            } else
              start += p + re.length();
            cx += 1;
          }
        if (cx) {
          rows.push_back(r);
          columns.push_back(terms[i]);
          values.push_back(cx);
          rowsums[r] += cx;
        }
      }
    }
  }
  S4 dtm("dgTMatrix");
  dtm.slot("Dim") = dim;
  dtm.slot("i") = rows;
  dtm.slot("j") = columns;
  dtm.slot("x") = values;
  return List::create(dtm, rowsums);
}

/**
 * @brief Extract matches to fuzzy terms from text.
 *
 * @param terms Character vector containing regular expression patterns.
 * @param text Text to extract term matches from.
 * @param raw Logical indicating whether `text` is raw text or collapsed terms.
 * @return List; a table of matches and their counts for each term.
 */

// [[Rcpp::export]]
List extract_matches(const CharacterVector &terms,
                     const std::vector<std::string> &text, const bool &raw) {
  const int n = terms.length();
  List res(n);
  string match;
  for (int i = 0, ck = 1e3; i < n; i++) {
    unordered_map<std::string, int> matches;
    const boost::regex re(terms[i]);
    for (const std::string &t : text) {
      if (raw) {
        boost::sregex_iterator search(t.cbegin(), t.cend(), re), search_end;
        for (; search != search_end; search++) {
          match = (*search).str();
          if (matches.find(match) == matches.end()) {
            matches.insert({match, 1});
          } else
            matches[match]++;
        }
      } else {
        if (boost::regex_match(t, re)) {
          if (matches.find(t) == matches.end()) {
            matches.insert({t, 1});
          } else
            matches[t]++;
        }
      }
      if (!--ck) {
        checkUserInterrupt();
        ck = 1e3;
      }
    }
    res[i] = matches;
  }
  return res;
}

1		#' Read/Write Dictionary Files
2		#'
3		#' Read in or write dictionary files in Comma-Separated Values (.csv; weighted) or
4		#' Linguistic Inquiry and Word Count (.dic; non-weighted) format.
5		#' @param path Path to a file, a name corresponding to a file in \code{getOption('lingmatch.dict.dir')}
6		#' (or \code{'~/Dictionaries'}) or one of the dictionaries available at \href{https://osf.io/y6g5b}{osf.io/y6g5b},
7		#' a matrix-like object to be categorized, or a list to be formatted.
8		#' @param cats A character vector of category names to be returned. All categories are returned by default.
9		#' @param type A character indicating whether and how terms should be altered. Unspecified or matching 'asis'
10		#' leaves terms as they are. Other options change wildcards to regular expressions:
11		#' \code{'pattern'} (\code{'^[poi]'}) replaces initial asterisks with \code{'\\\\b\\\\w*'},
12		#' and terminal asterisks with \code{'\\\\w*\\\\b'}, to match terms within raw text;
13		#' for anything else, terms are padded with \code{^} and \code{$}, then those bounding marks are removed
14		#' when an asterisk is present, to match tokenized terms.
15		#' @param as.weighted Logical; if \code{TRUE}, prevents weighted dictionaries from being converted to
16		#' unweighted versions, or converts unweighted dictionaries to a binary weighted version
17		#' -- a data.frame with a "term" column of unique terms, and a column for each category.
18		#' @param dir Path to a folder containing dictionaries, or where you would like dictionaries to be downloaded;
19		#' passed to \code{\link{select.dict}} and/or \code{\link{download.dict}}.
20		#' @param ... Passes arguments to \code{\link{readLines}}.
21		#' @param term.name,category.name Strings identifying column names in \code{path} containing terms and categories
22		#' respectively.
23		#' @param raw Logical or a character. As logical, indicates if \code{path} should be treated
24		#' as a raw dictionary (as might be read in from a .dic file). As a character, replaces \code{path}
25		#' as if it were read in from a file.
26		#' @return \code{read.dic}: A \code{list} (unweighted) with an entry for each category containing
27		#' character vectors of terms, or a \code{data.frame} (weighted) with columns for terms (first, "term") and
28		#' weights (all subsequent, with category labels as names).
29		#' @family Dictionary functions
30		#' @importFrom utils read.table write.table
31		#' @export
32
33		read.dic <- function(path, cats = NULL, type = "asis", as.weighted = FALSE, dir = getOption("lingmatch.dict.dir"), ...,
34		term.name = "term", category.name = "category", raw = FALSE) {
35	60x	if (ckd <- dir == "") dir <- "~/Dictionaries"
36	62x	if (!is.logical(raw)) {
37	1x	path <- raw
38	1x	raw <- TRUE
39		}
40	!	if (missing(path)) path <- file.choose()
41	62x	if (!raw) {
42	61x	if (is.character(path) && !any(file.exists(path)) &&
43	61x	any(file.exists(normalizePath(paste0(dir, "/", path), "/", FALSE)))) {
44	2x	path <- normalizePath(paste0(dir, "/", path), "/", FALSE)
45		}
46	61x	if (is.character(path) && !any(file.exists(path))) {
47	3x	tp <- select.dict(path, dir = if (ckd) "" else dir)
48	3x	if (nrow(tp$selected) && length(path) <= nrow(tp$info)) {
49	2x	if (any(tp$selected$downloaded == "")) {
50	!	td <- rownames(tp$selected)[tp$selected$downloaded == ""]
51	!	if (!ckd && grepl("^$\|^[yt1]\|^ent", readline(paste0(
52	!	"would you like to download ", if (length(td) == 1) "this dictionary" else "these dictionaries", "?:\n",
53	!	sub(",(?=[^,]+$)", if (length(td) == 2) " and" else ", and", paste0(td, collapse = ", "), perl = TRUE),
54	!	"\n(press Enter for yes): "
55		)))) {
56	!	tp$selected[td, "downloaded"] <- unlist(download.dict(td, dir = dir), use.names = FALSE)
57		}
58		}
59	2x	path <- tp$selected[tp$selected[, "downloaded"] != "", "downloaded"]
60	2x	if (!length(path)) {
61	!	stop(
62	!	if (nrow(tp$selected) == 1) "dictionary" else "dictionaries", " (",
63	!	paste(rownames(tp$selected), collapse = ", "), ") not found in dir (", dir, ")",
64	!	if (ckd) '\nspecify a directory (e.g., dir = "~") to locate or download; see ?download.dict',
65	!	call. = FALSE
66		)
67		}
68		}
69		}
70	61x	if (is.character(path) && length(path) > 1 && any(file.exists(path))) {
71	2x	dicts <- list()
72	2x	for (p in path) {
73	4x	if (file.exists(p)) {
74	4x	name <- gsub("^.*[/\\]+\|\\.[^.]+$", "", p)
75	4x	dicts[[name]] <- read.dic(p)
76		}
77		}
78	2x	path <- if (length(dicts) == 1) dicts[[1]] else dicts
79		}
80		}
81	62x	if (!is.null(dim(path))) {
82	1x	if (anyNA(path)) path[is.na(path)] <- 0
83	33x	cols <- colnames(path)
84	33x	if (term.name %in% cols) {
85	23x	terms <- path[, term.name]
86	23x	cols <- cols[cols != term.name]
87	10x	} else if (!is.null(rownames(path)) && any(grepl("[a-z]", rownames(path), TRUE))) {
88	1x	terms <- rownames(path)
89		} else {
90	9x	su <- which(vapply(cols, function(col) !is.numeric(path[, col]), TRUE))
91	9x	if (!length(su)) {
92	2x	if (!is.null(colnames(path))) {
93	1x	path <- data.frame(term = colnames(path), t(path), stringsAsFactors = FALSE)
94	1x	terms <- path$term
95	1x	cols <- colnames(path)[-1]
96	1x	if (missing(as.weighted)) as.weighted <- TRUE
97		} else {
98	1x	stop("no non-numeric columns found in path")
99		}
100		} else {
101	7x	if (length(su) > 1) {
102	4x	ssu <- vapply(su, function(col) {
103	9x	if (!anyDuplicated(path[, col])) {
104	6x	1
105	2x	} else if (all(path[, col] == path[1, col])) 0 else 2
106	4x	}, 0)
107	4x	if (length(su) == ncol(path) && !any(ssu == 0)) {
108	2x	path <- data.frame(
109	2x	term = unlist(path[, su], use.names = FALSE),
110	2x	category = rep(colnames(path), each = nrow(path))
111		)
112	2x	category.name <- cols <- "category"
113	2x	su <- 1
114		} else {
115	2x	if (any(ssu == 2)) {
116	!	cols <- colnames(path)[su[which(ssu == 2)]]
117	!	if (length(cols) > 1 && length(su) != ncol(path)) cols <- cols[1]
118		}
119	2x	su <- if (any(ssu == 1)) su[which(ssu == 1)[[1]]] else su[[1]]
120		}
121		}
122	7x	terms <- path[, su]
123		}
124		}
125	32x	if (category.name %in% colnames(path)) {
126	4x	su <- which(vapply(cols, function(col) is.numeric(path[, col]), TRUE))
127	4x	cols <- path[, category.name]
128	4x	if (length(su) == 1) {
129	1x	weights <- path[, names(su)]
130	1x	wl <- data.frame(term = terms)
131	1x	v <- numeric(nrow(path))
132	1x	for (cat in unique(cols)) {
133	2x	su <- cols == cat
134	2x	v[su] <- weights[su]
135	2x	wl[, cat] <- v
136	2x	v[] <- 0
137		}
138		} else {
139	3x	wl <- split(terms, cols)
140		}
141		} else {
142	28x	su <- vapply(cols, function(col) {
143	78x	if (is.numeric(path[, col])) {
144	70x	1
145	8x	} else if (anyDuplicated(path[, col])) {
146	1x	if (!all(path[, col] == path[1, col])) 2 else 3
147		} else {
148	7x	0
149		}
150	28x	}, 0)
151	28x	wl <- if (!1 %in% su && any(su == 2) && any(su != 2)) {
152	!	cols <- cols[su == 2]
153	!	wl <- lapply(cols, function(col) split(terms, path[, col]))
154	!	names(wl) <- cols
155	!	wl
156		} else {
157	28x	if (!any(su == 1)) {
158	1x	if (any(su > 1)) {
159	1x	cols <- cols[su > 1]
160	1x	if (length(cols) == 1) {
161	1x	split(terms, path[, cols])
162		} else {
163	!	wl <- lapply(cols, function(col) split(terms, path[, col]))
164	!	names(wl) <- cols
165	!	unlist(wl, FALSE)
166		}
167		} else {
168	!	stop("no numeric columns found in path")
169		}
170	27x	} else if (as.weighted) {
171	13x	cbind(term = terms, path[, cols[su == 1], drop = FALSE])
172		} else {
173	14x	cols <- cols[su == 1]
174	14x	if (length(cols) == 1) {
175	4x	weights <- path[, cols]
176	4x	if (any(weights < 0) && any(weights > 0)) {
177	3x	Filter(length, list(
178	3x	positive = terms[weights > 0],
179	3x	neutral = terms[weights == 0],
180	3x	negative = terms[weights < 0]
181		))
182	!	} else if (anyDuplicated(weights)) split(terms, weights) else list(category = terms)
183		} else {
184	10x	weights <- as.data.frame(path[, cols], stringsAsFactors = FALSE)
185	10x	if (any(weights > 0) && any(weights < 0)) {
186	3x	for (col in cols) {
187	6x	if (any(weights[, col] > 0)) weights[, paste0(col, ".positive")] <- as.integer(weights[, col] > 0)
188	4x	if (any(weights[, col] == 0)) weights[, paste0(col, ".neutral")] <- as.integer(weights[, col] == 0)
189	4x	if (any(weights[, col] < 0)) weights[, paste0(col, ".negative")] <- as.integer(weights[, col] < 0)
190	6x	weights <- weights[, colnames(weights) != col]
191		}
192	3x	cols <- sort(colnames(weights))
193		}
194	10x	lvs <- sort(unique(unlist(weights)))
195	10x	if (length(lvs) == 2 && all(lvs == c(0, 1))) {
196	8x	wl <- lapply(cols, function(col) terms[weights[, col] == 1])
197	8x	names(wl) <- cols
198	8x	wl <- Filter(length, wl)
199		} else {
200	2x	wl <- split(terms, colnames(weights)[max.col(weights, "first")])
201		}
202	10x	wl
203		}
204		}
205		}
206		}
207	29x	} else if (is.list(path)) {
208	13x	path <- Filter(length, path)
209	13x	if (all(vapply(path, function(d) is.character(d) \|\| is.factor(d), TRUE))) {
210	5x	wl <- path
211	2x	if (is.null(names(wl))) names(wl) <- paste0("cat", seq_along(wl))
212	5x	if (!is.null(cats)) {
213	2x	wl <- wl[names(wl) %in% cats]
214	!	if (!length(wl)) stop("no cats were found in path")
215		}
216		} else {
217	4x	if (is.null(names(path))) names(path) <- paste0("dict", seq_along(path))
218	8x	wl <- if (any(vapply(path, function(d) !is.null(dim(d)), TRUE))) {
219	5x	terms <- NULL
220	5x	cols <- NULL
221	5x	for (d in names(path)) {
222	10x	path[[d]] <- read.dic(path[[d]], as.weighted = as.weighted)
223	10x	if (as.weighted) {
224	2x	terms <- unique(c(terms, path[[d]]$term))
225	2x	cols <- c(cols, paste0(d, ".", colnames(path[[d]])[-1]))
226		}
227		}
228	5x	if (as.weighted) {
229	1x	wl <- as.data.frame(
230	1x	matrix(0, length(terms), length(cols), dimnames = list(terms, cols)),
231	1x	stringsAsFactors = FALSE
232		)
233	1x	for (d in names(path)) {
234	2x	cols <- colnames(path[[d]])[-1] <- paste0(d, ".", colnames(path[[d]])[-1])
235	2x	su <- duplicated(path[[d]]$term)
236	2x	if (any(su)) {
237	1x	su <- path[[d]]$term %in% path[[d]]$term[su]
238	1x	td <- path[[d]][su, , drop = FALSE]
239	1x	for (term in unique(td$term)) wl[term, cols] <- colMeans(td[td$term == term, cols])
240		}
241	2x	if (any(!su)) path[[d]] <- path[[d]][!su, ]
242	2x	rownames(path[[d]]) <- path[[d]]$term
243	2x	wl[path[[d]]$term, cols] <- path[[d]][, cols]
244		}
245	1x	data.frame(term = rownames(wl), wl, stringsAsFactors = FALSE)
246		} else {
247	4x	unlist(path, FALSE)
248		}
249	8x	} else if (any(vapply(path, is.list, TRUE))) {
250	1x	unlist(path, FALSE)
251		} else {
252	2x	if (any(vapply(path, function(d) is.null(names(d)), TRUE))) {
253	1x	if (all(vapply(path, length, 0) == length(path[[1]]))) {
254	1x	data.frame(term = names(path), do.call(rbind, path), stringsAsFactors = FALSE)
255		} else {
256	!	stop("failed to resolve path; as a list, entries should contain character or named numeric vectors")
257		}
258		} else {
259	1x	terms <- unique(unlist(lapply(path, names), use.names = FALSE))
260	1x	v <- structure(numeric(length(terms)), names = terms)
261	1x	data.frame(term = terms, vapply(path, function(d) {
262	2x	v[names(d)] <- d
263	2x	v
264	1x	}, numeric(length(terms))), stringsAsFactors = FALSE)
265		}
266		}
267		}
268		} else {
269	16x	if (raw \|\| length(path) != 1) {
270	1x	if (length(path) == 1) path <- strsplit(path, "\n")[[1]]
271	2x	di <- path
272		} else {
273	14x	di <- tryCatch(readLines(path, warn = FALSE, ...), error = function(e) NULL)
274	1x	if (is.null(di)) stop("assumed path (", path, ") is to a file, but failed to read it in", call. = FALSE)
275		}
276	15x	lst <- grep("%", di, fixed = TRUE)
277	15x	if (length(lst) > 1 && !grepl(",", di[lst[1]], fixed = TRUE)) {
278	7x	if (length(lst) < 2) {
279	!	stop(
280	!	"could not identify the end of the header -- ",
281	!	"this should be the second percent sign (%) following the last category name."
282		)
283		}
284	7x	lst <- lst[2]
285	7x	h <- grep("^\\d", gsub("^\\s+\|\\s%+\\s\|\\s+$", "", di[seq_len(lst)]), value = TRUE)
286	7x	ci <- character()
287	7x	for (cat in h) ci[sub("\\s.*$", "", cat)] <- sub("^[^\\s]+\\s+", "", cat, perl = TRUE)
288	7x	if (is.null(cats)) cats <- ci
289	7x	sep <- if (grepl("\t", di[lst + 1], fixed = TRUE)) "\t" else "\\s"
290	7x	cb <- paste0("(?:", sep, "+(?:", paste(names(ci), collapse = "\|"), ")(?=", sep, "\|$))*$")
291	7x	di <- di[-seq_len(lst - 1)]
292	7x	wl <- lapply(structure(names(ci), names = ci), function(cat) {
293	52x	unique(sub(cb, "", di[grep(paste0(sep, cat, cb), di, perl = TRUE)], perl = TRUE))
294		})
295	7x	wl <- wl[cats[cats %in% names(wl)]]
296		} else {
297	6x	if (missing(as.weighted) && length(path) == 1) as.weighted <- TRUE
298	8x	wl <- if (any(grepl("[\\s,]", di, perl = TRUE))) {
299	7x	di <- read.table(
300	7x	text = di, header = TRUE, sep = if (grepl("\t", di[[1]])) "\t" else ",",
301	7x	quote = '"', comment.char = "", stringsAsFactors = FALSE
302		)
303	7x	if (!missing(as.weighted) \|\| (!term.name %in% colnames(di) && !any(vapply(di, is.character, TRUE)) &&
304	7x	!any(grepl("[a-z]", rownames(di), TRUE)))) {
305	7x	di <- tryCatch(
306	7x	read.dic(di, cats = cats, type = type, as.weighted = as.weighted),
307	7x	error = function(e) e$message
308		)
309		}
310	7x	di
311		} else {
312	1x	list(cat1 = di)
313		}
314	8x	if (length(wl) == 1 && is.character(wl)) {
315	!	stop("assuming path is to a comma separated values file, but failed to read it in:\n", wl)
316		}
317		}
318		}
319	1x	if (!missing(type) && !grepl("^[Aa]", type)) wl <- to_regex(wl, grepl("^[poi]", type, TRUE))
320	60x	if (is.null(dim(wl)) && !is.null(cats)) {
321	9x	su <- names(wl) %in% cats
322	9x	if (any(su)) wl <- wl[su]
323		}
324	60x	if (as.weighted && is.null(dim(wl))) {
325	1x	op <- data.frame(term = unique(unlist(wl)), stringsAsFactors = FALSE)
326	1x	for (cat in names(wl)) op[, cat] <- as.integer(op$term %in% wl[[cat]])
327	1x	op
328		} else {
329	59x	if (is.data.frame(wl) && !as.weighted && !missing(as.weighted)) {
330	!	read.dic(wl, cats = cats, as.weighted = FALSE)
331		} else {
332	59x	wl
333		}
334		}
335		}
336
337		#' @rdname read.dic
338		#' @param dict A \code{list} with a named entry of terms for each category, or a \code{data.frame}
339		#' with terms in one column, and categories or weights in the rest.
340		#' @param filename The name of the file to be saved.
341		#' @param save Logical: if \code{FALSE}, does not write a file.
342		#' @return \code{write.dic}: A version of the written dictionary -- a raw character vector for
343		#' unweighted dictionaries, or a \code{data.frame} for weighted dictionaries.
344		#' @examples
345		#' # make a small murder related dictionary
346		#' dict <- list(
347		#' kill = c("kill", "murd", "wound", "die"),
348		#' death = c("death", "dying", "die", "kill*")
349		#' )
350		#'
351		#' # convert it to a weighted format
352		#' (dict_weighted <- read.dic(dict, as.weighted = TRUE))
353		#'
354		#' # categorize it back
355		#' read.dic(dict_weighted)
356		#'
357		#' # convert it to a string without writing to a file
358		#' cat(raw_dict <- write.dic(dict, save = FALSE))
359		#'
360		#' # parse it back in
361		#' read.dic(raw = raw_dict)
362		#'
363		#' \dontrun{
364		#'
365		#' # save it as a .dic file
366		#' write.dic(dict, "murder")
367		#'
368		#' # read it back in as a list
369		#' read.dic("murder.dic")
370		#'
371		#' # read in the Moral Foundations or LUSI dictionaries from urls
372		#' moral_dict <- read.dic("https://osf.io/download/whjt2")
373		#' lusi_dict <- read.dic("https://osf.io/download/29ayf")
374		#'
375		#' # save and read in a version of the General Inquirer dictionary
376		#' inquirer <- read.dic("inquirer", dir = "~/Dictionaries")
377		#' }
378		#' @export
379
380		write.dic <- function(dict, filename = NULL, type = "asis", as.weighted = FALSE, save = TRUE) {
381	5x	if (!is.list(dict) \|\| is.data.frame(dict)) {
382	2x	if (save && (missing(as.weighted) \|\| as.weighted)) {
383	2x	as.weighted <- TRUE
384	2x	o <- dict
385		} else {
386	!	dict <- read.dic(dict)
387		}
388		}
389	5x	if (is.null(dim(dict))) {
390	3x	terms <- unique(as.character(unlist(dict, use.names = FALSE)))
391	3x	terms <- terms[terms != ""]
392	!	if (!missing(type) && !grepl("^[Aa]", type)) dict <- to_regex(dict, grepl("^[poi]", type, TRUE))
393	3x	if (as.weighted) {
394	1x	o <- data.frame(term = terms, stringsAsFactors = FALSE)
395	1x	for (cat in names(dict)) o[, cat] <- as.integer(o$term %in% dict[[cat]])
396		} else {
397	2x	l <- length(dict)
398	2x	m <- as.data.frame(matrix("", length(terms) + l + 2, l + 1), stringsAsFactors = FALSE)
399	2x	m[, 1] <- c("%", seq_len(l), "%", terms)
400	2x	m[seq_len(l) + 1, 2] <- if (is.null(names(dict))) seq_len(l) else names(dict)
401	2x	for (i in seq_along(dict)) m[which(m[-seq_len(l + 2), 1] %in% dict[[i]]) + l + 2, i + 1] <- i
402	2x	o <- gsub("\t{2,}", "\t", paste(sub("\t+$", "", do.call(paste, c(m, sep = "\t"))), collapse = "\n"))
403		}
404		}
405	5x	if (save && is.character(filename)) {
406	4x	filename <- filename[[1]]
407	!	if (!grepl("\\.[^.]+$", filename)) filename <- paste0(filename, if (as.weighted) ".csv" else ".dic")
408	4x	if (as.weighted) {
409	3x	write.table(o, filename, sep = ",", row.names = FALSE, qmethod = "double")
410		} else {
411	1x	write(o, filename)
412		}
413	4x	message("dictionary saved to ", filename)
414		}
415	5x	invisible(o)
416		}

1		#' Generate a Report of Term Matches
2		#'
3		#' Extract matches to fuzzy terms (globs/wildcards or regular expressions) from provided text, in order
4		#' to assess their appropriateness for inclusion in a dictionary.
5		#' @param dict A vector of terms, list of such vectors, or a matrix-like object to be
6		#' categorized by \code{\link{read.dic}}.
7		#' @param text A vector of text to extract matches from. If not specified, will use the terms
8		#' in the \code{term_map} retrieved from \code{\link{select.lspace}}.
9		#' @param space A vector space used to calculate similarities between term matches.
10		#' Name of a the space (see \code{\link{select.lspace}}), a matrix with terms as row names, or
11		#' \code{TRUE} to auto-select a space based on matched terms.
12		#' @param glob Logical; if \code{TRUE}, converts globs (asterisk wildcards) to regular expressions.
13		#' If not specified, this will be set automatically.
14		#' @param parse_phrases Logical; if \code{TRUE} (default) and \code{space} is specified, will
15		#' break unmatched phrases into single terms, and average across and matched vectors.
16		#' @param tolower Logical; if \code{FALSE}, will retain \code{text}'s case.
17		#' @param punct Logical; if \code{FALSE}, will remove punctuation markings in \code{text}.
18		#' @param special Logical; if \code{FALSE}, will attempt to replace special characters in \code{text}.
19		#' @param as_terms Logical; if \code{TRUE}, will treat \code{text} as terms, meaning \code{dict}
20		#' terms will only count as matches when matching the complete text.
21		#' @param bysentence Logical; if \code{TRUE}, will split \code{text} into sentences, and only
22		#' consider unique sentences.
23		#' @param as_string Logical; if \code{FALSE}, returns matches as tables rather than a string.
24		#' @param term_map_freq Proportion of terms to include when using the term map as a source
25		#' of terms. Applies when \code{text} is not specified.
26		#' @param term_map_spaces Number of spaces in which a term has to appear to be included.
27		#' Applies when \code{text} is not specified.
28		#' @param outFile File path to write results to, always ending in \code{.csv}.
29		#' @param space_dir Directory from which \code{space} should be loaded.
30		#' @param verbose Logical; if \code{FALSE}, will not display status messages.
31		#' @family Dictionary functions
32		#' @seealso For a more complete assessment of dictionaries, see \code{\link{dictionary_meta}()}.
33		#'
34		#' Similar information is provided in the \href{https://miserman.github.io/dictionary_builder/}{dictionary builder} web tool.
35		#' @note
36		#' Matches are extracted for each term independently, so they may not align with some implementations
37		#' of dictionaries. For instance, by default \code{\link{lma_patcat}} matches destructively, and sorts
38		#' terms by length such that shorter terms will not match the same text and longer terms that overlap.
39		#' Here, the match would show up for both terms.
40		#' @returns A \code{data.frame} of results, with a row for each unique term, and the following columns:
41		#' \itemize{
42		#' \item \strong{\code{term}}: The originally entered term.
43		#' \item \strong{\code{regex}}: The converted and applied regular expression form of the term.
44		#' \item \strong{\code{categories}}: Comma-separated category names,
45		#' if \code{dict} is a list with named entries.
46		#' \item \strong{\code{count}}: Total number of matches to the term.
47		#' \item \strong{\code{max_count}}: Number of matches to the most representative
48		#' (that with the highest average similarity) variant of the term.
49		#' \item \strong{\code{variants}}: Number of variants of the term.
50		#' \item \strong{\code{space}}: Name of the latent semantic space, if one was used.
51		#' \item \strong{\code{mean_sim}}: Average similarity to the most representative variant among terms
52		#' found in the space, if one was used.
53		#' \item \strong{\code{min_sim}}: Minimal similarity to the most representative variant.
54		#' \item \strong{\code{matches}}: Variants, with counts and similarity (Pearson's r) to the
55		#' most representative term (if a space was specified). Either in the form of a comma-separated
56		#' string or a \code{data.frame} (if \code{as_string} is \code{FALSE}).
57		#' }
58		#' @examples
59		#' text <- c(
60		#' "I am sadly homeless, and suffering from depression :(",
61		#' "This wholesome happiness brings joy to my heart! :D:D:D",
62		#' "They are joyous in these fearsome happenings D:",
63		#' "I feel weightless now that my sadness has been depressed! :()"
64		#' )
65		#' dict <- list(
66		#' sad = c("less", "sad", "depres*", ":("),
67		#' happy = c("some", "happ", "joy*", "d:"),
68		#' self = c("i ", "my ")
69		#' )
70		#'
71		#' report_term_matches(dict, text)
72		#' @export
73
74		report_term_matches <- function(dict, text = NULL, space = NULL, glob = TRUE,
75		parse_phrases = TRUE, tolower = TRUE, punct = TRUE, special = TRUE,
76		as_terms = FALSE, bysentence = FALSE, as_string = TRUE,
77		term_map_freq = 1, term_map_spaces = 1, outFile = NULL,
78		space_dir = getOption("lingmatch.lspace.dir"), verbose = TRUE) {
79	!	if (missing(dict)) stop("dict must be specified", call. = FALSE)
80	7x	collapsed_terms <- FALSE
81	7x	if (is.null(text)) {
82	1x	term_map <- select.lspace(dir = space_dir, get.map = TRUE)$term_map
83	1x	if (is.null(term_map)) {
84	!	stop(
85	!	"term map not found; specify `space_dir` or provide text",
86	!	call. = FALSE
87		)
88		}
89	1x	if (term_map_freq > 0 && term_map_freq < 1) {
90	!	term_map <- term_map[seq(1, ceiling(nrow(term_map) * term_map_freq)), ]
91		}
92	1x	if (term_map_spaces > 0 && term_map_spaces < ncol(term_map)) {
93	1x	term_map <- term_map[rowSums(term_map != 0) >= term_map_spaces, ]
94		}
95	1x	collapsed_terms <- as_terms <- TRUE
96	1x	text <- paste(rownames(term_map), collapse = " ")
97		}
98	!	if (is.null(text) && is.null(space)) stop("either text or space must be specified", call. = FALSE)
99	7x	st <- proc.time()[[3]]
100	7x	if (!is.null(text) && !as_terms) {
101	5x	if (verbose) cat("preparing text (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
102	!	if (bysentence) text <- read.segments(text, segment.size = 1, bysentence = TRUE)$text
103	5x	if (tolower) text <- tolower(text)
104	!	if (!punct) text <- gsub("[,_:;/\\\\.?!\"()\\{}[]\|\\]", " ", text)
105	!	if (!special) text <- lma_dict("special", as.function = gsub)(text)
106	5x	text <- unique(text)
107		}
108	7x	if (verbose) cat("preparing dict (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
109	!	if (!is.null(dim(dict)) \|\| (is.character(dict) && length(dict) == 1 && file.exists(dict))) dict <- read.dic(dict)
110	7x	terms <- data.frame(term = unique(unlist(dict, use.names = FALSE)))
111	7x	if (missing(glob)) {
112	7x	glob <- any(grepl("^\\*", terms$term))
113	7x	if (!glob) {
114	2x	glob <- any(grepl("\\*$", terms$term))
115	1x	if (glob) glob <- !any(grepl("(?:\\\\\\w\|[].})])\\*$", terms$term))
116		}
117		}
118	7x	rawtext <- !as_terms \|\| collapsed_terms
119	7x	terms$regex <- to_regex(list(terms$term), TRUE, glob)[[1]]
120	7x	terms <- terms[!is.na(terms$regex) & terms$regex != "", ]
121	7x	terms$regex <- if (rawtext) paste0("\\b", terms$regex, "\\b") else paste0("^", terms$regex, "$")
122	7x	if (is.list(dict)) {
123	!	if (is.null(names(dict))) names(dict) <- paste0("cat_", seq_along(dict))
124	5x	categories <- character(nrow(terms))
125	5x	for (cat in names(dict)) {
126	20x	su <- terms$term %in% dict[[cat]]
127	20x	if (any(su)) {
128	20x	ssu <- su & categories != ""
129	20x	categories[ssu] <- paste0(categories[ssu], ", ", cat)
130	20x	categories[su & categories == ""] <- cat
131		}
132		}
133	5x	terms$categories <- categories
134		}
135	7x	if (verbose) cat("extracting matches (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
136	7x	matches <- extract_matches(terms$regex, text, rawtext)
137	7x	has_space <- FALSE
138	7x	space_name <- NULL
139	7x	if (!is.null(space)) {
140	2x	obs <- unique(unlist(lapply(matches, names), use.names = FALSE))
141	2x	if (is.logical(space) && space) {
142	1x	space <- select.lspace(terms = obs)$selected
143	1x	space <- if (nrow(space)) rownames(space)[1] else NULL
144		}
145	2x	if (is.character(space)) {
146	2x	if (verbose) cat("loading space (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
147	2x	space_name <- space
148	2x	space <- lma_lspace(obs, space, dir = space_dir)
149		}
150	!	if (!nrow(space) \|\| !any(obs %in% rownames(space))) space <- NULL
151	2x	if (is.null(space)) {
152	!	warning("failed to recognize space")
153		} else {
154	2x	su <- obs %in% rownames(space)
155	2x	if (parse_phrases && any(!su)) {
156	2x	phrase <- grepl("[ _/-]", obs)
157	2x	if (any(phrase)) {
158	2x	if (verbose) cat("parsing phrases (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
159	2x	split_parts <- strsplit(obs[phrase], "[ _/-]")
160	2x	parts <- unique(unlist(split_parts, use.names = FALSE))
161	2x	part_vectors <- if (is.null(space_name)) {
162	!	if (any(parts %in% rownames(space))) space[parts[parts %in% rownames(space)]] else space[0, ]
163		} else {
164	2x	lma_lspace(parts, space_name)
165		}
166	2x	if (nrow(part_vectors)) {
167	2x	space_terms <- rownames(part_vectors)
168	2x	space_dim <- ncol(space)
169	2x	names(split_parts) <- obs[phrase]
170	2x	agg_vectors <- t(vapply(split_parts, function(p) {
171	30x	su <- p %in% space_terms
172	30x	if (any(su)) {
173	30x	colMeans(part_vectors[p[su], , drop = FALSE])
174		} else {
175	!	numeric(space_dim)
176		}
177	2x	}, numeric(space_dim)))
178	2x	space <- rbind(space, agg_vectors[rowSums(agg_vectors) != 0, ])
179		}
180		}
181		}
182	!	if (is.null(space_name)) space_name <- "custom"
183	2x	has_space <- TRUE
184		}
185		}
186	7x	if (verbose) cat("preparing results (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
187	7x	terms <- cbind(terms, do.call(rbind, lapply(matches, function(m) {
188	58x	hits <- if (length(m)) {
189	6x	if (collapsed_terms) m[!is.na(m) & m != 0] <- 1L
190	37x	if (has_space) {
191	15x	msim <- m
192	15x	if (length(m) == 1) {
193	1x	msim[] <- 1L
194		} else {
195	14x	msim[] <- NA
196	14x	su <- names(m) %in% rownames(space)
197	14x	if (sum(su) == 1) {
198	!	msim[su] <- 1L
199	14x	} else if (any(su)) {
200	14x	ns <- names(m)[su]
201	14x	sims <- lma_simets(space[ns, ], metric = "pearson", symmetrical = TRUE)
202	14x	msim[su] <- as.numeric(sims[, which.max(colMeans(sims))])
203		}
204		}
205	15x	o <- order(msim, -nchar(names(m)), decreasing = TRUE)
206	15x	m <- m[o]
207	15x	if (as_string) {
208	15x	paste(paste0(names(m), " (", if (!as_terms) paste0(m, ", "), round(msim[o], 2), ")"), collapse = ", ")
209		} else {
210	!	list(as.data.frame(rbind(m, msim[o])))
211		}
212		} else {
213	22x	m <- m[order(m, -nchar(names(m)), decreasing = TRUE)]
214	22x	if (as_string) {
215	13x	paste(paste0(names(m), if (!as_terms) paste0(" (", m, ")")), collapse = ", ")
216		} else {
217	9x	list(t(as.data.frame(m)))
218		}
219		}
220		} else {
221	7x	if (has_space) msim <- NA
222	2x	if (as_string) "" else list(data.frame())
223		}
224	11x	if (!as_string) rownames(hits[[1]]) <- NULL
225	58x	res <- if (collapsed_terms) {
226	11x	data.frame(
227	11x	count = length(m),
228	11x	max_count = if (all(is.na(m))) 0L else 1L,
229	11x	variants = length(m)
230		)
231		} else {
232	47x	data.frame(
233	47x	count = sum(m),
234	47x	max_count = if (all(is.na(m))) 0L else max(m, na.rm = TRUE),
235	47x	variants = length(m)
236		)
237		}
238	58x	if (has_space) {
239	22x	res$space <- space_name
240	22x	if (all(is.na(msim))) {
241	7x	res$mean_sim <- res$min_sim <- NA
242		} else {
243	15x	res$mean_sim <- mean(msim, na.rm = TRUE)
244	15x	res$min_sim <- min(msim, na.rm = TRUE)
245		}
246		}
247	58x	res[["matches"]] <- hits
248	58x	res
249		})))
250	7x	terms <- terms[if (has_space) order(terms$mean_sim) else order(terms$variants, decreasing = TRUE), ]
251	7x	rownames(terms) <- NULL
252	7x	if (!is.null(outFile)) {
253	1x	outFile <- paste0(sub(".csv", "", outFile, fixed = TRUE), ".csv")
254	1x	if (verbose) cat("writing results: ", outFile, " (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
255	1x	write.table(terms, outFile, sep = ",", row.names = FALSE, qmethod = "double")
256		}
257	7x	if (verbose) cat("done (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
258	7x	terms
259		}

1		#' Download Dictionaries
2		#'
3		#' Downloads the specified dictionaries from \href{https://osf.io/y6g5b}{osf.io/y6g5b}.
4		#'
5		#' @param dict One or more names of dictionaries to download, or \code{'all'} for all available. See
6		#' \href{https://osf.io/y6g5b/wiki/home}{osf.io/y6g5b/wiki} for more information, and a list of available dictionaries.
7		#' @param check.md5 Logical; if \code{TRUE} (default), retrieves the MD5 checksum from OSF,
8		#' and compares it with that calculated from the downloaded file to check its integrity.
9		#' @param mode A character specifying the file write mode; default is 'wb'. See
10		#' \code{\link{download.file}}.
11		#' @param dir Directory in which to save the dictionary; \cr default is \code{getOption('lingmatch.dict.dir')}. \cr
12		#' This must be specified, or the option must be set -- use \code{\link{lma_initdirs}} to initialize a directory.
13		#' @param overwrite Logical; if \code{TRUE}, will replace existing files.
14		#' @return Path to the downloaded dictionary, or a list of such if multiple were downloaded.
15		#' @family Dictionary functions
16		#' @examples
17		#' \dontrun{
18		#'
19		#' download.dict("lusi", dir = "~/Dictionaries")
20		#' }
21		#' @export
22
23		download.dict <- function(
24		dict = "lusi", check.md5 = TRUE, mode = "wb", dir = getOption("lingmatch.dict.dir"),
25		overwrite = FALSE) {
26	1x	download.resource(
27	1x	"dict", dict,
28	1x	check.md5 = check.md5, mode = mode, dir = dir, overwrite = overwrite
29		)
30		}

1		#' Categorize Texts
2		#'
3		#' Categorize raw texts using a pattern-based dictionary.
4		#'
5		#' @param text A vector of text to be categorized. Texts are padded by 2 spaces, and potentially lowercased.
6		#' @param dict At least a vector of terms (patterns), usually a matrix-like object with columns for terms,
7		#' categories, and weights.
8		#' @param pattern.weights A vector of weights corresponding to terms in \code{dict}, or the column name of
9		#' weights found in \code{dict}.
10		#' @param pattern.categories A vector of category names corresponding to terms in \code{dict}, or the column name of
11		#' category names found in \code{dict}.
12		#' @param bias A constant to add to each category after weighting and summing. Can be a vector with names
13		#' corresponding to the unique values in \code{dict[, category]}, but is usually extracted from dict based
14		#' on the intercept included in each category (defined by \code{name.map['intname']}).
15		#' @param to.lower Logical indicating whether \code{text} should be converted to lowercase before processing.
16		#' @param return.dtm Logical; if \code{TRUE}, only a document-term matrix will be returned, rather than the
17		#' weighted, summed, and biased category values.
18		#' @param exclusive Logical; if \code{FALSE}, each dictionary term is searched for in the original text.
19		#' Otherwise (by default), terms are sorted by length (with longer terms being searched for first), and
20		#' matches are removed from the text (avoiding subsequent matches to matched patterns).
21		#' @param drop.zeros logical; if \code{TRUE}, categories or terms with no matches will be removed.
22		#' @param boundary A string to add to the beginning and end of each dictionary term. If \code{TRUE},
23		#' \code{boundary} will be set to \code{' '}, avoiding pattern matches within words. By default, dictionary
24		#' terms are left as entered.
25		#' @param fixed Logical; if \code{FALSE}, patterns are treated as regular expressions.
26		#' @param globtoregex Logical; if \code{TRUE}, initial and terminal asterisks are replaced with \code{\\\\b\\\\w*}
27		#' and \code{\\\\w*\\\\b} respectively. This will also set \code{fixed} to \code{FALSE} unless fixed is specified.
28		#' @param name.map A named character vector:
29		#' \itemize{
30		#' \item \strong{\code{intname}}: term identifying category biases within the term list;
31		#' defaults to \code{'_intercept'}
32		#' \item \strong{\code{term}}: name of the column containing terms in \code{dict}; defaults to \code{'term'}
33		#' }
34		#' Missing names are added, so names can be specified positional (e.g., \code{c('_int',} \code{'terms')}),
35		#' or only some can be specified by name (e.g., \code{c(term =} \code{'patterns')}), leaving the rest default.
36		#' @param dir Path to a folder in which to look for \code{dict} if it is the name of a file to be passed to
37		#' \code{\link{read.dic}}.
38		#' @seealso For applying term-based dictionaries (to a document-term matrix) see \code{\link{lma_termcat}()}.
39		#' @family Dictionary functions
40		#' @return A matrix with a row per \code{text} and columns per dictionary category, or (when \code{return.dtm = TRUE})
41		#' a sparse matrix with a row per \code{text} and column per term. Includes a \code{WC} attribute with original
42		#' word counts, and a \code{categories} attribute with row indices associated with each category if
43		#' \code{return.dtm = TRUE}.
44		#' @examples
45		#' # example text
46		#' text <- c(
47		#' paste(
48		#' "Oh, what youth was! What I had and gave away.",
49		#' "What I took and spent and saw. What I lost. And now? Ruin."
50		#' ),
51		#' paste(
52		#' "God, are you so bored?! You just want what's gone from us all?",
53		#' "I miss the you that was too. I love that you."
54		#' ),
55		#' paste(
56		#' "Tomorrow! Tomorrow--nay, even tonight--you wait, as I am about to change.",
57		#' "Soon I will off to revert. Please wait."
58		#' )
59		#' )
60		#'
61		#' # make a document-term matrix with pre-specified terms only
62		#' lma_patcat(text, c("bored?!", "i lo", ". "), return.dtm = TRUE)
63		#'
64		#' # get counts of sets of letter
65		#' lma_patcat(text, list(c("a", "b", "c"), c("d", "e", "f")))
66		#'
67		#' # same thing with regular expressions
68		#' lma_patcat(text, list("[abc]", "[def]"), fixed = FALSE)
69		#'
70		#' # match only words
71		#' lma_patcat(text, list("i"), boundary = TRUE)
72		#'
73		#' # match only words, ignoring punctuation
74		#' lma_patcat(
75		#' text, c("you", "tomorrow", "was"),
76		#' fixed = FALSE,
77		#' boundary = "\\b", return.dtm = TRUE
78		#' )
79		#'
80		#' \dontrun{
81		#'
82		#' # read in the temporal orientation lexicon from the World Well-Being Project
83		#' tempori <- read.csv(paste0(
84		#' "https://raw.githubusercontent.com/wwbp/lexica/master/",
85		#' "temporal_orientation/temporal_orientation_lexicon.csv"
86		#' ))
87		#'
88		#' lma_patcat(text, tempori)
89		#'
90		#' # or use the standardized version
91		#' tempori_std <- read.dic("wwbp_prospection", dir = "~/Dictionaries")
92		#'
93		#' lma_patcat(text, tempori_std)
94		#'
95		#' ## get scores on the same scale by adjusting the standardized values
96		#' tempori_std[, -1] <- tempori_std[, -1] / 100 *
97		#' select.dict("wwbp_prospection")$selected[, "original_max"]
98		#'
99		#' lma_patcat(text, tempori_std)[, unique(tempori$category)]
100		#' }
101		#' @export
102
103		lma_patcat <- function(text, dict = NULL, pattern.weights = "weight", pattern.categories = "category", bias = NULL,
104		to.lower = TRUE, return.dtm = FALSE, drop.zeros = FALSE, exclusive = TRUE, boundary = NULL, fixed = TRUE,
105		globtoregex = FALSE, name.map = c(intname = "_intercept", term = "term"), dir = getOption("lingmatch.dict.dir")) {
106	71x	text_names <- names(text)
107	!	if (is.factor(text)) text <- as.character(text)
108	!	if (!is.character(text)) stop("enter a character vector as the first argument")
109	71x	text <- paste(" ", text, " ")
110	!	if (is.null(names(name.map)) && length(name.map) < 3) names(name.map) <- c("intname", "term")[seq_along(name.map)]
111	71x	wide <- FALSE
112	2x	if (missing(dict) && missing(pattern.weights) && missing(pattern.categories)) dict <- lma_dict()
113	71x	if (is.character(dict) && length(dict) == 1 && (file.exists(dict) \|\| grepl("^[A-Za-z_]{3}", dict)) &&
114	71x	missing(pattern.weights) && missing(pattern.categories)) {
115	1x	if (dir == "") dir <- "~/Dictionaries"
116	1x	if (!any(file.exists(dict)) && any(file.exists(normalizePath(paste0(dir, "/", dict), "/", FALSE)))) {
117	!	dict <- normalizePath(paste0(dir, "/", dict), "/", FALSE)
118		}
119	1x	td <- tryCatch(read.dic(dict), error = function(e) NULL)
120	1x	dict <- if (is.null(td)) list(cat1 = dict) else td
121		}
122	71x	if (!is.null(dim(dict))) {
123	13x	if (is.null(colnames(dict))) {
124	1x	colnames(dict) <- paste0("X", seq_len(ncol(dict)))
125		} else {
126	!	if (!is.data.frame(dict)) dict <- as.data.frame(as.matrix(dict), stringsAsFactors = FALSE)
127	12x	terms <- if (name.map[["term"]] %in% colnames(dict)) colnames(dict) != name.map[["term"]] else !logical(ncol(dict))
128	12x	if (missing(pattern.weights) && !any(pattern.weights %in% colnames(dict))) {
129	6x	if (any(su <- terms & vapply(dict, is.numeric, TRUE))) {
130	6x	terms <- terms & !su
131	6x	pattern.weights <- dict[, su]
132		}
133		}
134	12x	if (missing(pattern.categories) && !pattern.categories %in% colnames(dict)) {
135	6x	if (any(su <- terms & vapply(dict, function(v) !is.numeric(v) && anyDuplicated(v), TRUE))) {
136	1x	terms <- terms & !su
137	1x	pattern.categories <- dict[, su]
138	!	if (sum(su) > 1) pattern.categories <- do.call(paste, pattern.categories)
139		}
140		}
141	12x	if (name.map[["term"]] %in% colnames(dict)) {
142	10x	dict[, name.map[["term"]]]
143	2x	} else if (!all(terms)) {
144	1x	dict <- if (any(terms)) dict[, which(terms)[1]] else rownames(dict)
145		}
146		}
147		}
148		# independently entered wide weights
149	71x	if ((is.null(dict) \|\| is.null(dim(dict))) && (!is.null(ncol(pattern.weights)) \|\| !is.null(ncol(pattern.categories)))) {
150	6x	weights <- if (!is.null(ncol(pattern.weights))) pattern.weights else pattern.categories
151	6x	if (!is.null(rownames(weights)) && any(grepl("[^0-9]", rownames(weights)))) {
152	1x	dict <- rownames(weights)
153	5x	} else if (is.list(dict) && (length(dict) == 1 \|\|
154	5x	(length(dict[[1]]) == nrow(weights) && all(vapply(dict, length, 0) == nrow(weights))))) {
155	1x	dict <- dict[[1]]
156		}
157	!	if (length(dict) != nrow(weights)) stop("dict and wide weights do not align")
158	6x	wide <- TRUE
159	6x	if (!missing(pattern.categories) && is.character(pattern.categories) && any(su <- pattern.categories %in% weights)) {
160	!	weights <- weights[, pattern.categories[su], drop = FALSE]
161		}
162	6x	weights <- weights[, vapply(seq_len(ncol(weights)), function(col) is.numeric(weights[, col]), TRUE), drop = FALSE]
163	!	if (!ncol(weights)) stop("could not identify numeric weights in wide weights")
164	6x	lex <- list(terms = dict, weights = weights, category = colnames(weights))
165		# wide weights in dict
166	65x	} else if (!is.null(dim(dict)) && (
167	65x	(length(pattern.weights) > 1 && is.character(pattern.weights)) \|\|
168	65x	(length(pattern.categories) > 1 &&
169	65x	(length(pattern.categories) != nrow(dict) \|\| all(pattern.categories %in% colnames(dict)))) \|\|
170	65x	(!any(pattern.weights %in% colnames(dict)) && !any(pattern.categories %in% colnames(dict)))
171		)) {
172	8x	if (any(su <- pattern.weights %in% colnames(dict))) {
173	1x	categories <- pattern.weights[su]
174	7x	} else if (any(su <- pattern.categories %in% colnames(dict))) {
175	1x	categories <- pattern.categories
176	6x	} else if (any(su <- vapply(colnames(dict), function(v) is.numeric(dict[, v]), TRUE))) {
177	6x	categories <- colnames(dict)[su]
178		} else {
179	!	stop("could not find weights in dict column names")
180		}
181	8x	wide <- TRUE
182	8x	if (!name.map[["term"]] %in% colnames(dict)) {
183	1x	terms <- colnames(dict)[vapply(colnames(dict), function(v) !is.numeric(dict[, v]), TRUE)]
184	!	if (!length(terms)) stop("could not find terms in dict")
185	1x	name.map[["term"]] <- if (length(terms) > 1) {
186	!	su <- vapply(terms, function(v) !anyDuplicated(dict[, v]), TRUE)
187	!	if (any(su)) terms[which(su)[1]] else terms[1]
188		} else {
189	1x	terms
190		}
191		}
192	8x	lex <- list(term = dict[, name.map[["term"]]], weights = dict[, categories, drop = FALSE], category = categories)
193		# independently entered weights and categories
194	57x	} else if (is.null(dim(dict))) {
195	53x	if (is.null(dict) \|\| (is.numeric(dict) && is.null(names(dict))) \|\| (is.list(dict) && is.numeric(dict[[1]]) &&
196	53x	is.null(names(dict[[1]])))) {
197	!	stop("could not recognize terms in dict")
198		}
199	53x	n <- length(dict)
200	53x	lex <- data.frame(
201	53x	term = if (is.character(dict)) {
202	13x	dict
203	53x	} else if (is.numeric(dict)) {
204	!	names(dict)
205	53x	} else if (is.list(dict) &&
206	53x	is.numeric(dict[[1]])) {
207	4x	unlist(lapply(dict, names), use.names = FALSE)
208		} else {
209	36x	unlist(dict, use.names = FALSE)
210		},
211	53x	category = if (length(pattern.categories) == n) {
212	23x	if (is.list(dict) && !is.null(names(dict))) {
213	3x	names(dict)
214		} else {
215	20x	pattern.categories
216		}
217	53x	} else if (is.list(dict)) {
218	21x	rep(if (!is.null(names(dict))) {
219	21x	names(dict)
220		} else {
221	!	paste0("cat", seq_along(dict))
222	21x	}, vapply(dict, length, 0))
223		} else {
224	9x	"cat1"
225		},
226	53x	weights = if (is.numeric(dict)) {
227	!	unname(dict)
228	53x	} else if (is.numeric(pattern.weights)) {
229	2x	if (!is.null(names(pattern.weights)) && is.character(dict) && all(dict %in% names(pattern.weights))) {
230	!	pattern.weights[dict]
231		} else {
232	2x	pattern.weights
233		}
234	53x	} else if (is.list(dict)) {
235	40x	if (is.numeric(dict[[1]])) {
236	4x	unlist(dict, use.names = FALSE)
237	36x	} else if (is.list(pattern.weights) && is.numeric(pattern.weights[[1]])) {
238	2x	unlist(pattern.weights, use.names = FALSE)
239		} else {
240	34x	1
241		}
242		} else {
243	11x	1
244	53x	}, stringsAsFactors = FALSE
245		)
246		} else {
247	4x	term <- if ("term" %in% names(name.map)) name.map[["term"]] else "term"
248	4x	en <- colnames(dict)
249	4x	if (!term %in% en) {
250	1x	su <- vapply(en, function(v) !is.numeric(dict[, v]), TRUE)
251	1x	if (any(su)) {
252	1x	term <- en[which(su)[1]]
253	1x	if (sum(su) > 1) {
254	1x	su <- su & vapply(en, function(v) !anyDuplicated(dict[, v]), TRUE)
255	!	if (any(su)) term <- en[which(su)[1]]
256		}
257		} else {
258	!	stop("could not recognize terms in dict")
259		}
260		}
261	4x	lex <- data.frame(
262	4x	term = dict[[term]],
263	4x	category = if (length(pattern.categories) == nrow(dict)) {
264	!	pattern.categories
265	4x	} else if (pattern.categories %in% en) dict[[pattern.categories]] else "cat1",
266	4x	weights = if (length(pattern.weights) == nrow(dict)) {
267	!	pattern.weights
268	4x	} else if (all(pattern.weights %in% en)) dict[[pattern.weights]] else 1, stringsAsFactors = FALSE
269		)
270		}
271	2x	if (any(lex$category == "")) lex[lex$category == "", "category"] <- "cat_unnamed"
272	!	if (is.factor(lex$term)) lex$term <- as.character(lex$term)
273	71x	if (globtoregex \|\| !fixed) {
274	17x	lex$term <- to_regex(list(lex$term), TRUE, globtoregex)[[1]]
275	4x	if (missing(fixed)) fixed <- FALSE
276		}
277	71x	if (wide && return.dtm) {
278	1x	wide <- FALSE
279	1x	lex <- data.frame(term = lex$term, category = if (length(lex$category) == 1) lex$category else "all")
280		}
281	71x	if (!return.dtm && is.null(bias)) {
282	!	if (!"intname" %in% names(name.map)) name.map[["intname"]] <- "_intercept"
283	51x	if (any(su <- lex$term == name.map[["intname"]])) {
284	6x	if (wide) {
285	1x	bias <- structure(lex$weights[su, ], names = lex$categories[su])
286	1x	lex$term <- lex$term[!su]
287	1x	lex$weights <- lex$weights[!su, , drop = FALSE]
288		} else {
289	5x	bias <- structure(lex[su, "weights"], names = lex[su, "category"])
290	5x	lex <- lex[!su, ]
291		}
292		}
293		}
294	71x	if (exclusive) {
295	62x	cls <- tryCatch(-nchar(lex$term), error = function(e) NULL)
296	62x	if (is.null(cls)) {
297	!	warning(
298	!	"dict appears to be misencoded, so results may not be as expected;\n",
299	!	'might try reading the dictionary in with encoding = "latin1"'
300		)
301	!	lex$term <- iconv(lex$term, sub = "#")
302	!	cls <- -nchar(lex$term)
303		}
304	62x	if (wide) {
305	13x	o <- order(cls)
306	13x	lex$term <- lex$term[o]
307	13x	lex$weights <- lex$weights[o, ]
308		} else {
309	49x	lex <- lex[order(cls), ]
310		}
311		}
312	71x	lex$category <- factor(lex$category, unique(lex$category))
313	71x	categories <- levels(lex$category)
314	71x	if (length(bias)) {
315	2x	if (is.null(names(bias)) && length(bias) == length(categories)) names(bias) <- categories
316	!	if (any(su <- !categories %in% names(bias))) bias[categories[su]] <- 0
317		} else {
318	61x	bias <- structure(integer(length(categories)), names = categories)
319		}
320	71x	bias <- bias[categories]
321	8x	if (is.logical(boundary) && boundary) boundary <- " "
322	71x	if (missing(to.lower)) {
323	67x	if (any(grepl("[A-Z]", lex$term))) {
324	2x	to.lower <- FALSE
325	1x	if (!any(grepl("[a-z]", lex$term))) text <- toupper(text)
326		}
327		}
328	65x	if (to.lower) text <- tolower(text)
329	71x	st <- proc.time()[[3]]
330	71x	terms <- unique(lex$term)
331	71x	if (!fixed) {
332	17x	ck <- tryCatch(
333	17x	suppressWarnings(grepl(paste0("(?:", paste(terms, collapse = "\|"), ")"), "", perl = TRUE)),
334	17x	error = function(e) NULL
335		)
336	1x	if (is.null(ck)) stop("terms contain invalid regular expressions", call. = FALSE)
337		}
338	70x	op <- pattern_search(
339	70x	text, if (is.character(boundary)) paste0(boundary, terms, boundary) else terms,
340	70x	seq_along(terms) - 1L, fixed, exclusive
341		)
342	70x	colnames(op[[1]]) <- terms
343	70x	if (return.dtm) {
344	16x	attr(op[[1]], "categories") <- lapply(categories, function(cat) {
345	63x	which(colnames(op[[1]]) %in% lex[lex$category == cat, "term"])
346		})
347	16x	names(attr(op[[1]], "categories")) <- categories
348		} else {
349	54x	op[[1]] <- vapply(categories, function(cat) {
350	137x	l <- if (wide) {
351	26x	data.frame(term = lex$term, weights = if (cat %in% colnames(lex$weights)) {
352	26x	lex$weights[, cat]
353		} else {
354	!	lex$weights
355	26x	}, stringsAsFactors = FALSE)
356		} else {
357	111x	lex[lex$category == cat, ]
358		}
359	137x	as.numeric(op[[1]][, l$term, drop = FALSE] %*% l$weights + bias[[cat]])
360	54x	}, numeric(length(text)))
361	54x	if (length(text) == 1) {
362	9x	op[[1]] <- t(op[[1]])
363	9x	rownames(op[[1]]) <- 1
364		}
365		}
366	!	if (length(text_names) == nrow(op[[1]])) rownames(op[[1]]) <- text_names
367	70x	attr(op[[1]], "WC") <- op[[2]]
368	70x	attr(op[[1]], "time") <- c(patcat = proc.time()[[3]] - st)
369	1x	if (drop.zeros) op[[1]] <- op[[1]][, colSums(op[[1]]) != 0, drop = FALSE]
370	70x	op[[1]]
371		}

1		.onLoad <- function(lib, pkg) {
2	!	if (is.null(getOption("lingmatch.lspace.dir"))) options(lingmatch.lspace.dir = "")
3	!	if (is.null(getOption("lingmatch.dict.dir"))) options(lingmatch.dict.dir = "")
4		}
5
6		match_metric <- function(x) {
7	559x	mets <- c("jaccard", "euclidean", "canberra", "cosine", "pearson")
8	559x	sel <- if (is.null(x) \|\| (length(x) == 1 && grepl(tolower(substr(x, 1, 1)), "a", fixed = TRUE))) {
9	21x	mets
10	559x	} else if (is.function(x)) {
11	!	stop("only internal metrics are available: ", paste(mets, collapse = ", "), call. = FALSE)
12		} else {
13	538x	if (is.numeric(x)) {
14	1x	mets[x]
15		} else {
16	!	if (is.call(x)) x <- eval(x)
17	537x	su <- grepl("^(?:cor\|r)", x, TRUE)
18	2x	if (any(su)) x[su] <- "pearson"
19	537x	unique(unlist(lapply(substr(x, 1, 3), grep, mets, fixed = TRUE, value = TRUE)))
20		}
21		}
22	559x	list(all = mets, selected = sel, dummy = as.integer(mets %in% sel))
23		}
24
25		to_regex <- function(dict, intext = FALSE, isGlob = TRUE) {
26	31x	lapply(dict, function(l) {
27	34x	l <- gsub("([+])[+]+", "\\\\\\1+", sub("(?<=[^\\\\])\\\\$", "\\\\\\\\", l, perl = TRUE))
28	34x	if (isGlob) {
29	24x	if (any(ck <- grepl("(?:^\|\\s+)\\\|(?<=\\w)\\(?:$\|\\s+)", l, perl = TRUE))) {
30	18x	l[ck] <- gsub("\\", "[^\\\\s]", gsub("([.^$?(){}[-]\|\\])", "\\\\\\1", l[ck], perl = TRUE))
31		}
32		}
33	34x	if (any(ck <- grepl("[[({]", l) + grepl("[})]\|\\]", l) == 1)) {
34	7x	l[ck] <- gsub("([([{}\\])])", "\\\\\\1", l[ck], perl = TRUE)
35		}
36	34x	l
37		})
38		}
39
40		download.resource <- function(
41		type, resource, decompress = TRUE,
42		check.md5 = TRUE, mode = "wb", dir = "", overwrite = FALSE) {
43	4x	if (dir == "") {
44	!	stop(paste0(
45	!	"specify a directory (dir), or set the ", type,
46	!	" directory option\n(e.g., options(lingmatch.", type, ".dir = ",
47	!	'"~/', if (type == "dict") "Dictionaries" else "Latent Semantic Space",
48	!	'"))\nor initialize it with lma_initdirs()'
49	!	), call. = FALSE)
50		}
51	4x	all_resources <- rownames(if (type == "dict") dict_info else lss_info)
52	!	if (length(resource) == 1 && resource == "all") resource <- all_resources
53	4x	if (length(resource) > 1) {
54	1x	return(lapply(structure(resource, names = resource), function(d) {
55	2x	tryCatch(
56	2x	download.resource(
57	2x	type = type, resource = d, decompress = decompress,
58	2x	check.md5 = check.md5, mode = mode, dir = dir
59		),
60	2x	error = function(e) e$message
61		)
62		}))
63		}
64	3x	dir <- normalizePath(dir, "/", FALSE)
65	1x	if (resource == "default") resource <- if (type == "dict") "lusi" else "100k_lsa"
66	3x	name <- grep(paste0("^", sub("\\..*$", "", resource)), all_resources, value = TRUE)
67	3x	if (!length(name)) {
68	1x	name <- grep(
69	1x	paste0("^", substr(resource, 1, 4)), all_resources, TRUE,
70	1x	value = TRUE
71		)
72		}
73	3x	if (!length(name)) {
74	!	stop(
75	!	type, " ", resource, " not recognized; see https://osf.io/",
76	!	if (type == "dict") "y6g5b" else "489he", "/wiki for available resources"
77		)
78		} else {
79	3x	name <- name[1]
80		}
81	3x	urls <- list(
82	3x	info = function(id) paste0("https://api.osf.io/v2/files/", id),
83	3x	dl = function(id) paste0("https://osf.io/download/", id),
84	3x	versions = function(id) paste0("https://osf.io/", id, "/?show=revision")
85		)
86	!	if (!dir.exists(dir)) dir.create(dir, recursive = TRUE)
87	3x	dl <- function(id, ext) {
88	5x	s <- urls$dl(id)
89	5x	o <- unique(normalizePath(paste0(
90	5x	dir, "/", name, c(ext, sub(".bz2", "", ext, fixed = TRUE))
91	5x	), "/", FALSE))
92	5x	if (any(file.exists(o))) {
93	!	if (overwrite) {
94	!	unlink(o)
95		} else {
96	!	return(-1)
97		}
98		}
99	5x	status <- tryCatch(download.file(s, o[[1]], mode = mode), error = function(e) 1)
100	5x	if (!status && check.md5) {
101	5x	fi <- strsplit(readLines(urls$info(id), 1, TRUE, FALSE, "utf-8"), '[:,{}"]+')[[1]]
102	5x	ck <- md5sum(o[[1]])
103	5x	if (fi[which(fi == "md5") + 1] != ck) {
104	!	warning(paste0(
105	!	"MD5 (", ck, ") does not seem to match the one on record;\n",
106	!	"double check and try manually downloading at ", urls$versions(id)
107		))
108		}
109		}
110	!	if (status) warning("failed to download file from ", s, call. = FALSE)
111	5x	status
112		}
113	3x	if (type == "lspace") {
114	2x	status <- dl(lss_info[name, "osf_terms"], "_terms.txt")
115	2x	if (status < 1) status <- dl(lss_info[name, "osf_dat"], ".dat.bz2")
116	2x	if (status < 1 && decompress) {
117	2x	if (Sys.which("bunzip2") == "") {
118	!	warning("could not find path to bunzip2 command for decompression")
119		} else {
120	2x	o <- normalizePath(paste0(dir, "/", name, ".dat.bz2"), "/", FALSE)
121	2x	if (file.exists(o)) {
122	2x	status <- tryCatch(system2("bunzip2", shQuote(path.expand(o))), error = function(e) 1)
123	2x	if (status) {
124	!	warning(
125	!	'failed to decompress; might try this from a system console:\n bunzip2 "', path.expand(o), '"'
126		)
127		}
128		}
129		}
130		}
131	2x	paths <- normalizePath(paste0(
132	2x	dir, "/", name, c(".dat", if (!decompress) ".bz2", "_terms.txt")
133	2x	), "/", FALSE)
134		} else {
135	1x	ext <- if (dict_info[name, "weighted"]) ".csv" else ".dic"
136	1x	status <- dl(dict_info[name, "osf"], ext)
137	1x	paths <- normalizePath(paste0(dir, "/", name, ext), "/", FALSE)
138		}
139	3x	if (status < 1) {
140	3x	message(
141	3x	paste0(name, " ", type, " ", if (!status) "downloaded" else "exists", ":\n "),
142	3x	paste(paths, collapse = "\n ")
143		)
144		}
145	3x	invisible(paths)
146		}

1		#' Process Text
2		#'
3		#' A wrapper to other pre-processing functions, potentially from \code{\link{read.segments}}, to \code{\link{lma_dtm}}
4		#' or \code{\link{lma_patcat}}, to \code{\link{lma_weight}}, then \code{\link{lma_termcat}} or \code{\link{lma_lspace}},
5		#' and optionally including \code{\link{lma_meta}} output.
6		#'
7		#' @param input A vector of text, or path to a text file or folder.
8		#' @param ... arguments to be passed to \code{\link{lma_dtm}}, \code{\link{lma_patcat}}, \code{\link{lma_weight}},
9		#' \code{\link{lma_termcat}}, and/or \code{\link{lma_lspace}}. All arguments must be named.
10		#' @param meta Logical; if \code{FALSE}, metastatistics are not included. Only applies when raw text is available.
11		#' If included, meta categories are added as the last columns, with names starting with "meta_".
12		#' @param coverage Logical; if \code{TRUE} and a dictionary is provided (\code{dict}),
13		#' will calculate the coverage (number of unique term matches) of each dictionary category.
14		#' @return A matrix with texts represented by rows, and features in columns, unless there are multiple rows per output
15		#' (e.g., when a latent semantic space is applied without terms being mapped) in which case only the special output
16		#' is returned (e.g., a matrix with terms as rows and latent dimensions in columns).
17		#' @seealso If you just want to compare texts, see the \code{\link{lingmatch}()} function.
18		#' @examples
19		#' # starting with some texts in a vector
20		#' texts <- c(
21		#' "Firstly, I would like to say, and with all due respect...",
22		#' "Please, proceed. I hope you feel you can speak freely...",
23		#' "Oh, of course, I just hope to be clear, and not cause offense...",
24		#' "Oh, no, don't monitor yourself on my account..."
25		#' )
26		#'
27		#' # by default, term counts and metastatistics are returned
28		#' lma_process(texts)
29		#'
30		#' # add dictionary and percent arguments for standard dictionary-based results
31		#' lma_process(texts, dict = lma_dict(), percent = TRUE)
32		#'
33		#' # add space and weight arguments for standard word-centroid vectors
34		#' lma_process(texts, space = lma_lspace(texts), weight = "tfidf")
35		#' @export
36
37		lma_process <- function(input = NULL, ..., meta = TRUE, coverage = FALSE) {
38	23x	inp <- as.list(substitute(...()))
39	23x	funs <- c("read.segments", "lma_dtm", "lma_weight", "lma_lspace", "lma_termcat", "lma_patcat")
40	23x	allargs <- NULL
41	23x	arg_matches <- list()
42	23x	for (f in funs) {
43	138x	a <- names(as.list(args(f)))
44	138x	a <- a[c(FALSE, a[-1] %in% names(inp))]
45	138x	allargs <- c(allargs, a)
46	138x	arg_matches[[f]] <- inp[a]
47		}
48	23x	dupargs <- unique(allargs[duplicated(allargs)])
49	23x	arg_checks <- vapply(arg_matches, function(l) sum(!names(l) %in% dupargs), 0)
50		# identify input
51	23x	op <- NULL
52	1x	if (is.function(input)) stop("enter a character vector or matrix-like object as input")
53	2x	if (is.null(dim(input)) && is.list(input) && is.character(input[[1]])) input <- unlist(input, use.names = FALSE)
54	22x	if (is.character(input) \|\| is.factor(input)) {
55	12x	ck_paths <- length(input) != 1 && all(file.exists(input))
56	12x	op <- if (length(arg_matches$read.segments) \|\| ck_paths) {
57	1x	an <- names(arg_matches$read.segments)
58	1x	if (!any(grepl("path\|text", an))) arg_matches$read.segments$path <- input
59	1x	do.call(read.segments, lapply(arg_matches$read.segments, eval.parent, 2))
60		} else {
61	11x	data.frame(
62	11x	text = if (length(input) == 1 && file.exists(input)) readLines(input) else input, stringsAsFactors = FALSE
63		)
64		}
65		} else {
66	10x	if (is.null(dim(input))) {
67	3x	if (is.null(names(input)) \|\| (is.list(input) && !all(vapply(input, length, 0) == length(input[[1]])))) {
68	1x	stop("input is not of a recognized format -- should be text or a dtm-like object")
69		}
70	2x	input <- if (is.list(input)) as.data.frame(input, stringsAsFactors = FALSE) else t(input)
71		}
72	9x	op <- input
73		}
74		# process
75	21x	ck_text <- "text" %in% colnames(op) && is.character(op[, "text"])
76	21x	ck_changed <- FALSE
77	21x	if (ck_text) {
78	13x	if (arg_checks[["lma_patcat"]]) {
79	3x	if (!"return.dtm" %in% names(arg_matches$lma_patcat) && (coverage \|\| length(arg_matches$lma_weight))) {
80	3x	arg_matches$lma_patcat$return.dtm <- TRUE
81		}
82	3x	arg_matches$lma_patcat$text <- op[, "text"]
83	3x	x <- do.call(lma_patcat, lapply(arg_matches$lma_patcat, eval.parent, 2))
84	3x	ck_changed <- TRUE
85		} else {
86	10x	arg_matches$lma_dtm$text <- op[, "text"]
87	10x	x <- do.call(lma_dtm, lapply(arg_matches$lma_dtm, eval.parent, 2))
88	10x	ck_changed <- TRUE
89		}
90		} else {
91	8x	x <- op
92		}
93	21x	if (arg_checks[["lma_weight"]]) {
94	10x	arg_matches$lma_weight$dtm <- x
95	10x	x <- do.call(lma_weight, lapply(arg_matches$lma_weight, eval.parent, 2))
96	10x	attr(x, "categories") <- attr(arg_matches$lma_weight$dtm, "categories")
97	10x	ck_changed <- TRUE
98		}
99	21x	if (!is.null(attr(x, "categories"))) {
100	5x	categories <- attr(x, "categories")
101	5x	xc <- as.data.frame(
102	5x	matrix(0, nrow(op), length(categories), dimnames = list(NULL, names(categories))),
103	5x	stringsAsFactors = FALSE
104		)
105	5x	if (coverage) {
106	1x	cxc <- xc
107	1x	for (cat in names(categories)) {
108	3x	su <- x[, categories[[cat]], drop = FALSE]
109	3x	xc[, cat] <- rowSums(su, na.rm = TRUE)
110	3x	cxc[, cat] <- rowSums(su != 0, na.rm = TRUE)
111		}
112	1x	colnames(cxc) <- paste0("coverage_", colnames(xc))
113	1x	xc <- cbind(xc, cxc)
114		} else {
115	4x	for (cat in names(categories)) {
116	48x	xc[, cat] <- rowSums(x[, categories[[cat]], drop = FALSE], na.rm = TRUE)
117		}
118		}
119	5x	x <- xc
120	5x	ck_changed <- TRUE
121	16x	} else if (arg_checks[["lma_termcat"]] \|\| (length(arg_matches$lma_termcat) &&
122	16x	any(names(arg_matches$lma_termcat) != "dir"))) {
123	9x	arg_matches$lma_termcat$coverage <- coverage
124	9x	arg_matches$lma_termcat$dtm <- x
125	9x	x <- do.call(lma_termcat, lapply(arg_matches$lma_termcat, eval.parent, 2))
126	9x	ck_changed <- TRUE
127		}
128	21x	if (arg_checks[["lma_lspace"]]) {
129	2x	nr <- NROW(x)
130	2x	arg_matches$lma_lspace$dtm <- x
131	2x	x <- do.call(lma_lspace, lapply(arg_matches$lma_lspace, eval.parent, 2))
132	2x	if (nrow(x) != nr) {
133	1x	colnames(x) <- paste0("dim", seq_len(ncol(x)))
134	1x	return(x)
135		}
136	1x	ck_changed <- TRUE
137		}
138	5x	if (any(grepl("Matrix", class(x), fixed = TRUE))) x <- as.matrix(x)
139	15x	if (is.matrix(x)) x <- as.data.frame(x, stringsAsFactors = FALSE)
140	20x	op <- if (ck_text && ck_changed) cbind(op, if (is.null(dim(x))) t(x) else x) else x
141	20x	if (ck_text && meta) {
142	10x	opm <- lma_meta(op[, "text"])
143	10x	if (arg_checks[["lma_weight"]] &&
144	10x	(!"normalize" %in% names(arg_matches$lma_weight) \|\| arg_matches$lma_weight$normalize)) {
145	3x	cols <- c(9, 14:23)
146	3x	opm_counts <- opm[, cols]
147	3x	su <- opm_counts != 0
148	3x	adj <- if ("percent" %in% names(arg_matches$lma_weight) && arg_matches$lma_weight$percent) 100 else 1
149	3x	opm_counts[su] <- opm_counts[su] / rep(opm$words, length(cols))[which(su)] * adj
150	3x	opm[, cols] <- opm_counts
151		}
152	10x	colnames(opm) <- paste0("meta_", colnames(opm))
153	10x	op <- cbind(op, opm)
154		}
155	20x	op
156		}

1		#' Calculate Text-Based Metastatistics
2		#'
3		#' Calculate simple descriptive statistics from text.
4		#'
5		#' @param text A character vector of texts.
6		#' @return A data.frame:
7		#' \itemize{
8		#' \item \strong{\code{characters}}: Total number of characters.
9		#' \item \strong{\code{syllables}}: Total number of syllables, as estimated by split length of \cr
10		#' \code{'a+[eu]\|e+a\|i+\|o+[ui]\|u+\|y+[aeiou]'} - 1.
11		#' \item \strong{\code{words}}: Total number of words (raw word count).
12		#' \item \strong{\code{unique_words}}: Number of unique words (binary word count).
13		#' \item \strong{\code{clauses}}: Number of clauses, as marked by commas, colons, semicolons, dashes, or brackets
14		#' within sentences.
15		#' \item \strong{\code{sentences}}: Number of sentences, as marked by periods, question marks, exclamation points,
16		#' or new line characters.
17		#' \item \strong{\code{words_per_clause}}: Average number of words per clause.
18		#' \item \strong{\code{words_per_sentence}}: Average number of words per sentence.
19		#' \item \strong{\code{sixltr}}: Number of words 6 or more characters long.
20		#' \item \strong{\code{characters_per_word}}: Average number of characters per word
21		#' (\code{characters} / \code{words}).
22		#' \item \strong{\code{syllables_per_word}}: Average number of syllables per word
23		#' (\code{syllables} / \code{words}).
24		#' \item \strong{\code{type_token_ratio}}: Ratio of unique to total words: \code{unique_words} / \code{words}.
25		#' \item \strong{\code{reading_grade}}: Flesch-Kincaid grade level: .39 * \code{words} / \code{sentences} +
26		#' 11.8 * \code{syllables} / \code{words} - 15.59.
27		#' \item \strong{\code{numbers}}: Number of terms starting with numbers.
28		#' \item \strong{\code{punct}}: Number of terms starting with non-alphanumeric characters.
29		#' \item \strong{\code{periods}}: Number of periods.
30		#' \item \strong{\code{commas}}: Number of commas.
31		#' \item \strong{\code{qmarks}}: Number of question marks.
32		#' \item \strong{\code{exclams}}: Number of exclamation points.
33		#' \item \strong{\code{quotes}}: Number of quotation marks (single and double).
34		#' \item \strong{\code{apostrophes}}: Number of apostrophes, defined as any modified letter apostrophe, or backtick
35		#' or single straight or curly quote surrounded by letters.
36		#' \item \strong{\code{brackets}}: Number of bracketing characters (including parentheses, and square,
37		#' curly, and angle brackets).
38		#' \item \strong{\code{orgmarks}}: Number of characters used for organization or structuring (including
39		#' dashes, foreword slashes, colons, and semicolons).
40		#' }
41		#' @examples
42		#' text <- c(
43		#' succinct = "It is here.",
44		#' verbose = "Hear me now. I shall tell you about it. It is here. Do you hear?",
45		#' couched = "I might be wrong, but it seems to me that it might be here.",
46		#' bigwords = "Object located thither.",
47		#' excited = "It's there! It's there! It's there!",
48		#' drippy = "It's 'there', right? Not 'here'? 'there'? Are you Sure?",
49		#' struggly = "It's here -- in that place where it is. Like... the 1st place (here)."
50		#' )
51		#' lma_meta(text)
52		#' @export
53
54		lma_meta <- function(text) {
55	11x	text <- gsub("^\\s+\|\\s+$", "", text)
56	11x	dtm <- lma_dtm(text, numbers = TRUE, punct = TRUE, urls = FALSE)
57	11x	text <- gsub(paste0(
58	11x	"((?:^\|\\s)[a-z]+\\.[a-z.]+\|\\d\|(?:^\|\\s)[a-z]\|(?:^\|\\s)[iv]+\|",
59	11x	"ans\|govt\|apt\|etc\|st\|rd\|ft\|feat\|dr\|drs\|mr\|ms\|mrs\|messrs\|jr\|prof)\\."
60	11x	), "", text, TRUE)
61	11x	terms <- colnames(dtm)
62	11x	dwm <- dtm[, grepl("^[a-z']", terms), drop = FALSE]
63	11x	words <- colnames(dwm)
64	11x	word_lengths <- nchar(words)
65	11x	word_syllables <- vapply(strsplit(words, "a+[eu]\|e+a\|i+\|o+[ui]\|u+\|y+[aeiou]"), length, 0) - 1
66	11x	word_syllables[word_syllables == 0] <- 1
67	11x	res <- data.frame(
68	11x	characters = as.numeric(dwm %*% word_lengths),
69	11x	syllables = as.numeric(dwm %*% word_syllables),
70	11x	words = rowSums(dwm),
71	11x	unique_words = rowSums(dwm != 0),
72	11x	clauses = vapply(strsplit(text, '([.?!\n,:;)}>-]\|\\])([.?!\n,:;)}>\n\'"-]\|\\s\|\\])*'), length, 0),
73	11x	sentences = vapply(strsplit(text, '[.?!\n]([.?!\n\'"]\|\\s)*'), length, 0),
74	11x	stringsAsFactors = FALSE
75		)
76	11x	cbind(res, with(res, data.frame(
77	11x	words_per_clause = words / clauses,
78	11x	words_per_sentence = words / sentences,
79	11x	sixltr = as.numeric(dwm %*% (word_lengths > 5)),
80	11x	characters_per_word = characters / words,
81	11x	syllables_per_word = syllables / words,
82	11x	type_token_ratio = unique_words / words,
83	11x	reading_grade = .39 * words / sentences + 11.8 * syllables / words - 15.59,
84	11x	numbers = if (any(su <- grepl("^[0-9]", terms))) rowSums(dtm[, su, drop = FALSE]) else 0,
85	11x	puncts = if (any(su <- grepl("^[^a-z0-9]", terms))) rowSums(dtm[, su, drop = FALSE]) else 0,
86	11x	periods = if ("." %in% terms) dtm[, "."] else 0,
87	11x	commas = if ("," %in% terms) dtm[, ","] else 0,
88	11x	qmarks = if ("?" %in% terms) dtm[, "?"] else 0,
89	11x	exclams = if ("!" %in% terms) dtm[, "!"] else 0,
90	11x	quotes = if (any(su <- grepl('^[\'"]', terms))) rowSums(dtm[, su, drop = FALSE]) else 0,
91	11x	apostrophes = vapply(strsplit(text, "[\u02bc]+\|[a-zA-Z][\u0027\u0060\u2019]+[a-zA-Z]"), length, 0) - 1,
92	11x	brackets = if (any(su <- grepl("[(\\)<>{\\}[]\|\\]", terms))) rowSums(dtm[, su, drop = FALSE]) else 0,
93	11x	orgmarks = if (any(su <- grepl("[/:;-]", terms))) rowSums(dtm[, su, drop = FALSE]) else 0,
94	11x	row.names = seq_len(nrow(res)), stringsAsFactors = FALSE
95		)))
96		}

1		#' Document-Term Matrix Weighting
2		#'
3		#' Weight a document-term matrix.
4		#' @param dtm A matrix with words as column names.
5		#' @param weight A string referring at least partially to one (or a combination; see note) of the
6		#' available weighting methods:
7		#'
8		#' \strong{Term weights} (applied uniquely to each cell)
9		#' \itemize{
10		#' \item \strong{\code{binary}} \cr
11		#' \code{(dtm > 0) * 1} \cr
12		#' Convert frequencies to 1s and 0s; remove differences in frequencies.
13		#'
14		#' \item \strong{\code{log}} \cr
15		#' \code{log(dtm + 1, log.base)} \cr
16		#' Log of frequencies.
17		#'
18		#' \item \strong{\code{sqrt}} \cr
19		#' \code{sqrt(dtm)} \cr
20		#' Square root of frequencies.
21		#'
22		#' \item \strong{\code{count}} \cr
23		#' \code{dtm} \cr
24		#' Unaltered; sometimes called term frequencies (tf).
25		#'
26		#' \item \strong{\code{amplify}} \cr
27		#' \code{dtm ^ alpha} \cr
28		#' Amplify difference in frequencies.
29		#' }
30		#'
31		#' \strong{Document weights} (applied by column)
32		#' \itemize{
33		#' \item \strong{\code{dflog}} \cr
34		#' \code{log(colSums(dtm > 0), log.base)} \cr
35		#' Log of binary term sum.
36		#'
37		#' \item \strong{\code{entropy}} \cr
38		#' \code{1 - rowSums(x *} \code{log(x + 1, log.base) /} \code{log(ncol(x), log.base),} \code{na.rm = TRUE)} \cr
39		#' Where \code{x = t(dtm) / colSums(dtm > 0)}; entropy of term-conditional term distribution.
40		#'
41		#' \item \strong{\code{ppois}} \cr
42		#' \code{1 - ppois(pois.x,} \code{colSums(dtm) / nrow(dtm))} \cr
43		#' Poisson-predicted term distribution.
44		#'
45		#' \item \strong{\code{dpois}} \cr
46		#' \code{1 - dpois(pois.x, colSums(dtm) / nrow(dtm))} \cr
47		#' Poisson-predicted term density.
48		#'
49		#' \item \strong{\code{dfmlog}} \cr
50		#' \code{log(diag(dtm[max.col(t(dtm)), ]), log.base)} \cr
51		#' Log of maximum term frequency.
52		#'
53		#' \item \strong{\code{dfmax}} \cr
54		#' \code{diag(dtm[max.col(t(dtm)), ])} \cr
55		#' Maximum term frequency.
56		#'
57		#' \item \strong{\code{df}} \cr
58		#' \code{colSums(dtm > 0)} \cr
59		#' Sum of binary term occurrence across documents.
60		#'
61		#' \item \strong{\code{idf}} \cr
62		#' \code{log(nrow(dtm) / colSums(dtm > 0), log.base)} \cr
63		#' Inverse document frequency.
64		#'
65		#' \item \strong{\code{ridf}} \cr
66		#' \code{idf - log(dpois, log.base)} \cr
67		#' Residual inverse document frequency.
68		#'
69		#' \item \strong{\code{normal}} \cr
70		#' \code{sqrt(1 / colSums(dtm ^ 2))} \cr
71		#' Normalized document frequency.
72		#' }
73		#'
74		#' Alternatively, \code{'pmi'} or \code{'ppmi'} will apply a pointwise mutual information weighting
75		#' scheme (with \code{'ppmi'} setting negative values to 0).
76		#' @param normalize Logical: if \code{FALSE}, the dtm is not divided by document word-count before
77		#' being weighted.
78		#' @param wc.complete If the dtm was made with \code{\link{lma_dtm}} (has a \code{'WC'}
79		#' attribute), word counts for
80		#' frequencies can be based on the raw count (default; \code{wc.complete = TRUE}). If
81		#' \code{wc.complete = FALSE}, or the dtm does not have a \code{'WC'} attribute,
82		#' \code{rowSums(dtm)} is used as word count.
83		#' @param log.base The base of logs, applied to any weight using \code{\link[base]{log}}.
84		#' Default is 10.
85		#' @param alpha A scaling factor applied to document frequency as part of pointwise mutual
86		#' information weighting, or amplify's power (\code{dtm ^ alpha}, which defaults to 1.1).
87		#' @param pois.x integer; quantile or probability of the poisson distribution (\code{dpois(pois.x,
88		#' colSums(x,} \code{na.rm = TRUE) / nrow(x))}).
89		#' @param doc.only Logical: if \code{TRUE}, only document weights are returned (a single value for
90		#' each term).
91		#' @param percent Logical; if \code{TRUE}, frequencies are multiplied by 100.
92		#' @note
93		#' Term weights works to adjust differences in counts within documents, with differences meaning
94		#' increasingly more from \code{binary} to \code{log} to \code{sqrt} to \code{count} to \code{amplify}.
95		#'
96		#' Document weights work to treat words differently based on their between-document or overall frequency.
97		#' When term frequencies are constant, \code{dpois}, \code{idf}, \code{ridf}, and \code{normal} give
98		#' less common words increasingly more weight, and \code{dfmax}, \code{dfmlog}, \code{ppois}, \code{df},
99		#' \code{dflog}, and \code{entropy} give less common words increasingly less weight.
100		#'
101		#' \code{weight} can either be a vector with two characters, corresponding to term weight and
102		#' document weight (e.g., \code{c('count', 'idf')}), or it can be a string with term and
103		#' document weights separated by any of \code{:\\*_/; ,-} (e.g., \code{'count-idf'}).
104		#' \code{'tf'} is also acceptable for \code{'count'}, and \code{'tfidf'} will be parsed as
105		#' \code{c('count', 'idf')}, though this is a special case.
106		#'
107		#' For \code{weight}, term or document weights can be entered individually; term weights alone will
108		#' not apply any document weight, and document weights alone will apply a \code{'count'} term weight
109		#' (unless \code{doc.only = TRUE}, in which case a term-named vector of document weights is returned
110		#' instead of a weighted dtm).
111		#' @return A weighted version of \code{dtm}, with a \code{type} attribute added (\code{attr(dtm, 'type')}).
112		#' @examples
113		#' # visualize term and document weights
114		#'
115		#' ## term weights
116		#' term_weights <- c("binary", "log", "sqrt", "count", "amplify")
117		#' Weighted <- sapply(term_weights, function(w) lma_weight(1:20, w, FALSE))
118		#' if (require(splot)) splot(Weighted ~ 1:20, labx = "Raw Count", lines = "co")
119		#'
120		#' ## document weights
121		#' doc_weights <- c(
122		#' "df", "dflog", "dfmax", "dfmlog", "idf", "ridf",
123		#' "normal", "dpois", "ppois", "entropy"
124		#' )
125		#' weight_range <- function(w, value = 1) {
126		#' m <- diag(20)
127		#' m[upper.tri(m, TRUE)] <- if (is.numeric(value)) {
128		#' value
129		#' } else {
130		#' unlist(lapply(
131		#' 1:20, function(v) rep(if (value == "inverted") 21 - v else v, v)
132		#' ))
133		#' }
134		#' lma_weight(m, w, FALSE, doc.only = TRUE)
135		#' }
136		#'
137		#' if (require(splot)) {
138		#' category <- rep(c("df", "idf", "normal", "poisson", "entropy"), c(4, 2, 1, 2, 1))
139		#' op <- list(
140		#' laby = "Relative (Scaled) Weight", labx = "Document Frequency",
141		#' leg = "outside", lines = "connected", mv.scale = TRUE, note = FALSE
142		#' )
143		#' splot(
144		#' sapply(doc_weights, weight_range) ~ 1:20,
145		#' options = op, title = "Same Term, Varying Document Frequencies",
146		#' sud = "All term frequencies are 1.",
147		#' colorby = list(category, grade = TRUE)
148		#' )
149		#' splot(
150		#' sapply(doc_weights, weight_range, value = "sequence") ~ 1:20,
151		#' options = op, title = "Term as Document Frequencies",
152		#' sud = "Non-zero terms are the number of non-zero terms.",
153		#' colorby = list(category, grade = TRUE)
154		#' )
155		#' splot(
156		#' sapply(doc_weights, weight_range, value = "inverted") ~ 1:20,
157		#' options = op, title = "Term Opposite of Document Frequencies",
158		#' sud = "Non-zero terms are the number of zero terms + 1.",
159		#' colorby = list(category, grade = TRUE)
160		#' )
161		#' }
162		#'
163		#' @export
164
165		lma_weight <- function(dtm, weight = "count", normalize = TRUE, wc.complete = TRUE,
166		log.base = 10, alpha = 1, pois.x = 1L, doc.only = FALSE, percent = FALSE) {
167	!	if (is.null(dim(dtm))) dtm <- if (is.character(dtm) \|\| is.factor(dtm)) lma_dtm(dtm) else matrix(dtm, 1)
168	72x	ck <- attr(dtm, "type")
169	72x	if (!is.null(ck) && length(ck) == 3 && (ck[1] == "TRUE" \|\| ck[2] != "count" \|\| ck[3] != "NA")) {
170	!	message(
171	!	"the entered dtm appears to already be weighted (", paste(ck[2:3], collapse = "*"),
172	!	"), so it will not be altered"
173		)
174	!	return(dtm)
175		}
176	72x	weight <- tolower(weight)
177	3x	if (missing(normalize) && any(grepl("pmi", weight))) normalize <- FALSE
178	72x	if (normalize) {
179	45x	wc <- attr(dtm, "WC")
180	11x	if (is.null(wc) \|\| !wc.complete \|\| nrow(dtm) != length(wc)) wc <- rowSums(dtm, na.rm = TRUE)
181	45x	adj <- if (percent) 100 else 1
182	45x	if (.hasSlot(dtm, "x") && .hasSlot(dtm, "i")) {
183	40x	wc <- wc[dtm@i + 1]
184	40x	su <- wc != 0
185	40x	dtm@x[su] <- dtm@x[su] / wc[su] * adj
186		} else {
187	5x	su <- wc != 0
188	5x	dtm[su, ] <- dtm[su, ] / wc[su] * adj
189		}
190		}
191	72x	nr <- nrow(dtm)
192	72x	if (any(grepl("pmi", weight))) {
193	3x	tw <- dw <- "pmi"
194	3x	if (missing(log.base)) log.base <- 2
195	3x	twc <- sum(dtm, na.rm = TRUE)
196	3x	pc <- matrix(colSums(dtm^alpha, na.rm = TRUE) / twc^alpha, 1)
197	3x	dtm <- dtm / twc
198	3x	dtm <- dtm / rowSums(dtm, na.rm = TRUE) %*% pc
199	3x	if (.hasSlot(dtm, "x")) {
200	1x	dtm@x <- log(dtm@x, base = log.base)
201	1x	dtm@x[!is.finite(dtm@x)] <- 0
202		} else {
203	2x	dtm <- log(dtm, base = log.base)
204	2x	dtm[!is.finite(dtm)] <- 0
205		}
206	3x	if (any(grepl("pp", weight))) {
207	1x	tw <- dw <- "ppmi"
208	1x	dtm[dtm < 0] <- 0
209		}
210		} else {
211	69x	term <- function(x, type) {
212	56x	switch(type,
213	1x	binary = (x > 0) * 1,
214	1x	log = log(x + 1, base = log.base),
215	1x	sqrt = sqrt(x),
216	52x	count = x,
217	1x	amplify = x^alpha
218		)
219		}
220	69x	doc <- function(x, type) {
221	27x	d <- switch(type,
222	27x	df = colSums(x > 0, na.rm = TRUE),
223	27x	dflog = log(colSums(x > 0, na.rm = TRUE), base = log.base),
224	27x	dfmax = diag(x[max.col(t(x)), ]),
225	27x	dfmlog = log(diag(x[max.col(t(x)), ]), base = log.base),
226	27x	idf = log(nrow(x) / colSums(x > 0, na.rm = TRUE), base = log.base),
227	27x	normal = sqrt(1 / colSums(x^2, na.rm = TRUE)),
228	27x	dpois = 1 - dpois(pois.x, colSums(x, na.rm = TRUE) / nrow(x)),
229	27x	ppois = 1 - ppois(pois.x, colSums(x, na.rm = TRUE) / nrow(x)),
230	27x	ridf = doc(x, "idf") - log(doc(x, "dpois"), base = log.base),
231	27x	entropy = {
232	1x	x <- t(x) / colSums(x > 0, na.rm = TRUE)
233	1x	1 - rowSums(x * log(x + 1, base = log.base) /
234	1x	log(ncol(x), base = log.base), na.rm = TRUE)
235		}
236		)
237	27x	d[!is.finite(d)] <- 0
238	27x	d
239		}
240	69x	if (length(weight) == 1) {
241	69x	weight <- strsplit(weight, " [:\\_/; ,-] *")[[1]]
242	9x	if (length(weight) == 1 && weight == "tfidf") weight <- c("count", "idf")
243		}
244	1x	if (grepl("^(?:t\|na\|non\|f)", weight[1])) weight[1] <- "count"
245	69x	tws <- c("binary", "log", "sqrt", "count", "amplify")
246	69x	tw <- if (weight[1] == "") "count" else grep(substr(weight[1], 0, 4), tws, value = TRUE)[1]
247	69x	pdw <- TRUE
248	69x	dws <- c("df", "dflog", "dfmax", "dfmlog", "idf", "normal", "dpois", "ppois", "ridf", "entropy")
249	69x	if (is.na(tw)) {
250	15x	tw <- grep(substr(weight[1], 0, 4), dws, value = TRUE)[1]
251	15x	if (!is.na(tw)) {
252	15x	pdw <- FALSE
253	15x	if (!doc.only) {
254	2x	dw <- tw
255	2x	tw <- "count"
256		} else {
257	13x	return(doc(dtm, tw))
258		}
259		} else {
260	!	stop(paste(weight, collapse = " * "), " is not a recognized weight", call. = FALSE)
261		}
262		}
263	54x	if (pdw) dw <- if (length(weight) > 1) grep(substr(weight[2], 0, 4), dws, value = TRUE)[1] else "none"
264	!	if (is.na(dw)) dw <- "none"
265	1x	if (missing(alpha) && tw == "amplify") alpha <- 1.1
266	56x	dtm <- if (dw == "none") term(dtm, tw) else term(dtm, tw) * rep(doc(dtm, dw), each = nr)
267		}
268	59x	attr(dtm, "type") <- c(normalized = normalize, term = tw, document = dw)
269	59x	dtm
270		}

1		#' Latent Semantic Space (Embeddings) Operations
2		#'
3		#' Map a document-term matrix onto a latent semantic space, extract terms from a
4		#' latent semantic space (if \code{dtm} is a character vector, or \code{map.space =} \code{FALSE}),
5		#' or perform a singular value decomposition of a document-term matrix (if \code{dtm} is a matrix
6		#' and \code{space} is missing).
7		#' @param dtm A matrix with terms as column names, or a character vector of terms to be extracted
8		#' from a specified space. If this is of length 1 and \code{space} is missing, it will be treated
9		#' as \code{space}.
10		#' @param space A matrix with terms as rownames. If missing, this will be the right singular vectors
11		#' of a singular value decomposition of \code{dtm}. If a character, a file matching the character
12		#' will be searched for in \code{dir} (e.g., \code{space = 'google'}). If a file is not found and
13		#' the character matches one of the \href{https://osf.io/489he/wiki/home}{available spaces}, you
14		#' will be given the option to download it, as handled by \code{\link{download.lspace}}.
15		#' If \code{dtm} is missing, the entire space will be loaded and returned.
16		#' @param map.space Logical: if \code{FALSE}, the original vectors of \code{space} for terms
17		#' found in \code{dtm} are returned. Otherwise \code{dtm} \code{\%*\%} \code{space} is returned,
18		#' excluding uncommon columns of \code{dtm} and rows of \code{space}.
19		#' @param fill.missing Logical: if \code{TRUE} and terms are being extracted from a space, includes
20		#' terms not found in the space as rows of 0s, such that the returned matrix will have a row
21		#' for every requested term.
22		#' @param term.map A matrix with \code{space} as a column name, terms as row names, and indices of
23		#' the terms in the given space as values, or a numeric vector of indices with terms as names, or
24		#' a character vector of terms corresponding to rows of the space. This is used instead of reading
25		#' in an "_terms.txt" file corresponding to a \code{space} entered as a character (the name of a
26		#' space file).
27		#' @param dim.cutoff If a \code{space} is calculated, this will be used to decide on the number of
28		#' dimensions to be retained: \code{cumsum(d) / sum(d) < dim.cutoff}, where \code{d} is a vector
29		#' of singular values of \code{dtm} (i.e., \code{svd(dtm)$d}). The default is \code{.5}; lower
30		#' cutoffs result in fewer dimensions.
31		#' @param keep.dim Logical: if \code{TRUE}, and a space is being calculated from the input, a matrix
32		#' in the same dimensions as \code{dtm} is returned. Otherwise, a matrix with terms as rows and
33		#' dimensions as columns is returned.
34		#' @param use.scan Logical: if \code{TRUE}, reads in the rows of \code{space} with \code{\link{scan}}.
35		#' @param dir Path to a folder containing spaces. \cr
36		#' Set a session default with \code{options(lingmatch.lspace.dir = 'desired/path')}.
37		#' @note
38		#' A traditional latent semantic space is a selection of right singular vectors from the singular
39		#' value decomposition of a dtm (\code{svd(dtm)$v[, 1:k]}, where \code{k} is the selected number of
40		#' dimensions, decided here by \code{dim.cutoff}).
41		#'
42		#' Mapping a new dtm into a latent semantic space consists of multiplying common terms:
43		#' \code{dtm[, ct]} \code{\%*\% space[ct, ]}, where \code{ct} \code{=} \code{colnames(dtm)[colnames(dtm)}
44		#' \code{\%in\%} \code{rownames(space)]} -- the terms common between the dtm and the space. This
45		#' results in a matrix with documents as rows, and dimensions as columns, replacing terms.
46		#' @family Latent Semantic Space functions
47		#' @return A matrix or sparse matrix with either (a) a row per term and column per latent dimension (a latent
48		#' space, either calculated from the input, or retrieved when \code{map.space = FALSE}), (b) a row per document
49		#' and column per latent dimension (when a dtm is mapped to a space), or (c) a row per document and
50		#' column per term (when a space is calculated and \code{keep.dim = TRUE}).
51		#' @examples
52		#' text <- c(
53		#' paste(
54		#' "Hey, I like kittens. I think all kinds of cats really are just the",
55		#' "best pet ever."
56		#' ),
57		#' paste(
58		#' "Oh year? Well I really like cars. All the wheels and the turbos...",
59		#' "I think that's the best ever."
60		#' ),
61		#' paste(
62		#' "You know what? Poo on you. Cats, dogs, rabbits -- you know, living",
63		#' "creatures... to think you'd care about anything else!"
64		#' ),
65		#' paste(
66		#' "You can stick to your opinion. You can be wrong if you want. You know",
67		#' "what life's about? Supercharging, diesel guzzling, exhaust spewing,",
68		#' "piston moving ignitions."
69		#' )
70		#' )
71		#'
72		#' dtm <- lma_dtm(text)
73		#'
74		#' # calculate a latent semantic space from the example text
75		#' lss <- lma_lspace(dtm)
76		#'
77		#' # show that document similarities between the truncated and full space are the same
78		#' spaces <- list(
79		#' full = lma_lspace(dtm, keep.dim = TRUE),
80		#' truncated = lma_lspace(dtm, lss)
81		#' )
82		#' sapply(spaces, lma_simets, metric = "cosine")
83		#'
84		#' \dontrun{
85		#'
86		#' # specify a directory containing spaces,
87		#' # or where you would like to download spaces
88		#' space_dir <- "~/Latent Semantic Spaces"
89		#'
90		#' # map to a pretrained space
91		#' ddm <- lma_lspace(dtm, "100k", dir = space_dir)
92		#'
93		#' # load the matching subset of the space
94		#' # without mapping
95		#' lss_100k_part <- lma_lspace(colnames(dtm), "100k", dir = space_dir)
96		#'
97		#' ## or
98		#' lss_100k_part <- lma_lspace(dtm, "100k", map.space = FALSE, dir = space_dir)
99		#'
100		#' # load the full space
101		#' lss_100k <- lma_lspace("100k", dir = space_dir)
102		#'
103		#' ## or
104		#' lss_100k <- lma_lspace(space = "100k", dir = space_dir)
105		#' }
106		#' @export
107
108		lma_lspace <- function(dtm = "", space, map.space = TRUE, fill.missing = FALSE, term.map = NULL,
109		dim.cutoff = .5, keep.dim = FALSE, use.scan = FALSE, dir = getOption("lingmatch.lspace.dir")) {
110	45x	if (ckd <- dir == "") dir <- "~/Latent Semantic Spaces"
111	45x	if ((is.character(dtm) \|\| is.factor(dtm)) && missing(space)) {
112	2x	if (length(dtm) > 1 && any(grepl(" ", dtm, fixed = TRUE))) {
113	!	dtm <- lma_dtm(dtm)
114	2x	} else if (length(dtm) == 1 && dtm != "") {
115	2x	if (missing(use.scan)) use.scan <- TRUE
116	2x	space <- dtm
117	2x	dtm <- ""
118		}
119		}
120	4x	if (is.data.frame(dtm)) dtm <- as.matrix(dtm)
121	45x	if (missing(space)) {
122	11x	nr <- nrow(dtm)
123	!	if (is.null(nr)) stop("enter a matrix for dtm, or specify a space")
124	11x	s <- svd(dtm)
125	11x	s$v <- t(s$v)
126	11x	k <- cumsum(s$d) / sum(s$d)
127	!	if (dim.cutoff > 1) dim.cutoff <- 1
128	11x	k <- if (length(k) == 1) 1 else seq_len(if (any(k < dim.cutoff)) which(k >= dim.cutoff)[1] else 1)
129	11x	if (keep.dim) {
130	4x	dtm[] <- s$u[, k] %% (if (length(k) == 1) matrix(s$d[k]) else diag(s$d[k])) %% s$v[k, ]
131		} else {
132	7x	cn <- colnames(dtm)
133	7x	dtm <- t(s$v[k, , drop = FALSE])
134	7x	rownames(dtm) <- cn
135		}
136		} else {
137	34x	terms <- if (is.null(colnames(dtm))) {
138	20x	map.space <- FALSE
139	20x	dtm
140		} else {
141	14x	colnames(dtm)
142		}
143	34x	if (is.character(space)) {
144	!	if (space == "default") space <- "100k_lsa"
145	22x	name <- gsub("^.[/\\]\|\\.[^/\\]$", "", space)[1]
146	22x	spaces <- list.files(dir)
147	22x	ts <- grep(space, spaces, fixed = TRUE, value = TRUE)
148	22x	if (!length(ts)) {
149	!	ts <- rownames(select.lspace(name)$selected)[1]
150	!	if (!ckd && !is.na(ts) && grepl("^$\|^[yt1]\|^ent", readline(paste0(
151	!	"would you like to download the ", ts, " space? (press Enter for yes): "
152		)))) {
153	!	download.lspace(ts, dir = dir)
154	!	ts <- paste0(ts, ".dat")
155		} else {
156	!	stop("space (", space, ") not found in dir (", dir, ")",
157	!	if (ckd) '\nspecify a directory (e.g., dir = "~") to locate or download; see ?download.lspace',
158	!	call. = FALSE
159		)
160		}
161		}
162	22x	space_path <- normalizePath(paste0(dir, "/", if (length(su <- grep("\\.dat$", ts))) {
163	22x	ts[su[[1]]]
164		} else {
165	!	use.scan <- TRUE
166	!	ts[grep("[bgx]z[ip2]*$", ts)[[1]]]
167	22x	}), "/", FALSE)
168	22x	name <- sub("\\.[^.]*$", "", basename(space_path))
169	1x	if (name %in% colnames(term.map)) term.map <- term.map[term.map[, name] != 0, name]
170	22x	rex <- function(inds, f) {
171	4x	nc <- length(strsplit(readLines(f, 1), "\\s+")[[1]])
172	4x	l <- length(inds)
173	4x	all <- all(seq_len(l) == inds)
174	4x	r <- matrix(0, l, nc)
175	4x	i <- 1
176	4x	con <- file(f, "r")
177	4x	on.exit(close(con))
178	4x	while (i <= l) {
179	821x	if (all) {
180	2x	n <- l
181		} else {
182	819x	n <- 1
183	819x	while (i + n < l && inds[i + n - 1] == inds[i + n] - 1) n <- n + 1
184		}
185	821x	r[seq_len(n) + i - 1, ] <- matrix(scan(
186	821x	con,
187	821x	n = n * nc, quiet = TRUE, skip = (if (i == 1) {
188	4x	inds[i]
189		} else {
190	817x	inds[i] - inds[i - 1]
191	821x	}) - 1, quote = "", comment.char = "", na.strings = ""
192	821x	), n, nc, byrow = TRUE)
193	821x	i <- i + n
194		}
195	4x	r
196		}
197	22x	if (is.null(term.map)) {
198	21x	terms_file <- paste0(dir, "/", name, "_terms.txt")
199	!	if (!file.exists(terms_file)) stop("terms file (", space, "_terms.txt) not found in dir (", dir, ").")
200	21x	space_terms <- readLines(terms_file)
201	21x	su <- if (length(terms) == 1 && terms == "") {
202	2x	terms <- space_terms
203	2x	!logical(length(space_terms))
204		} else {
205	19x	space_terms %in% terms
206		}
207	21x	if (sum(su) < length(terms)) {
208	8x	lsterms <- tolower(space_terms)
209	8x	su2 <- !duplicated(lsterms) & lsterms %in% terms[!terms %in% space_terms[su]]
210	8x	if (any(su2)) {
211	!	space_terms[su2] <- lsterms[su2]
212	!	su <- su \| su2
213		}
214		}
215	21x	if (sum(su)) {
216	21x	space <- if (use.scan) rex(which(su), space_path) else t(extract_indices(which(su), space_path))
217	21x	rownames(space) <- space_terms[su]
218	21x	ts <- terms[terms %in% rownames(space)]
219	21x	space <- space[ts, , drop = FALSE]
220		} else {
221	!	stop("no matching terms in space ", space)
222		}
223		} else {
224	!	if (is.character(term.map)) term.map <- structure(seq_along(term.map), names = term.map)
225	1x	su <- which(names(term.map) %in% terms)
226	1x	inds <- as.numeric(sort(if (length(terms) == 1 && terms == "") term.map else term.map[su]))
227	1x	if (length(inds)) {
228	1x	space <- if (use.scan) rex(inds, space_path) else t(extract_indices(inds, space_path))
229	1x	rownames(space) <- ts <- names(term.map)[inds]
230		} else {
231	!	stop("no matching terms in space ", space)
232		}
233		}
234		} else {
235	!	if (is.data.frame(space)) space <- as.matrix(space)
236	12x	name <- deparse(substitute(space))
237	12x	su <- terms %in% rownames(space)
238	12x	if (sum(su)) {
239	12x	ts <- terms[su]
240	12x	space <- space[ts, , drop = FALSE]
241	!	} else if (sum(su <- terms %in% colnames(space))) {
242	!	ts <- terms[su]
243	!	space <- t(space[, ts, drop = FALSE])
244		} else {
245	!	stop("no matching terms in provided space")
246		}
247		}
248	34x	if (fill.missing) {
249	3x	su <- which(!terms %in% rownames(space))
250	3x	if (length(su)) {
251	3x	space <- rbind(space, matrix(0, length(su), ncol(space), dimnames = list(terms[su])))
252	3x	space <- space[terms, ]
253		}
254	3x	ts <- rownames(space)
255		}
256	34x	attr(space, "space") <- name
257	34x	if (map.space) {
258	14x	rep <- length(ts) / ncol(dtm)
259	14x	if (rep < .2) {
260	!	warning(paste0(
261	!	"only ", round(rep * 100, 2), "% of dtm terms appear in the provided space; ",
262	!	"you might consider using a different source or cleaning/partial matching terms"
263	!	), call. = FALSE)
264		}
265	14x	dtm <- dtm[, ts, drop = FALSE] %*% space
266	14x	attr(dtm, "space") <- name
267		} else {
268	20x	return(space)
269		}
270		}
271	25x	dtm
272		}

1		#' Assess Dictionary Categories Within a Latent Semantic Space
2		#'
3		#'
4		#' @param dict A vector of terms, list of such vectors, or a matrix-like object to be
5		#' categorized by \code{\link{read.dic}}.
6		#' @param space A vector space used to calculate similarities between terms.
7		#' Names of spaces (see \code{\link{select.lspace}}), a matrix with terms as row names, or
8		#' \code{"auto"} to auto-select a space based on matched terms. This can also be \code{multi}
9		#' to use multiple spaces, which are combined after similarities are calculated.
10		#' @param n_spaces Number of spaces to draw from if \code{space} is \code{multi}.
11		#' @param suggest Logical; if \code{TRUE}, will search for other terms for possible inclusion
12		#' in \code{space}.
13		#' @param suggestion_terms Number of terms to use when selecting suggested additions.
14		#' @param suggest_stopwords Logical; if \code{TRUE}, will suggest function words.
15		#' @param suggest_discriminate Logical; if \code{TRUE}, will adjust for similarity to other
16		#' categories when finding suggestions.
17		#' @param expand_cutoff_freq Proportion of mapped terms to include when expanding dictionary terms.
18		#' Applies when \code{space} is a character (referring to a space to be loaded).
19		#' @param expand_cutoff_spaces Number of spaces in which a term has to appear to be considered
20		#' for expansion. Applies when \code{space} is a character (referring to a space to be loaded).
21		#' @param dimension_prop Proportion of dimensions to use when searching for suggested additions,
22		#' where less than 1 will calculate similarities to the category core using fewer dimensions
23		#' of the space.
24		#' @param pairwise Logical; if \code{FALSE}, will compare candidate suggestion terms with a single,
25		#' averaged category vector rather than all category terms separately.
26		#' @param glob Logical; if \code{TRUE}, converts globs (asterisk wildcards) to regular expressions.
27		#' @param space_dir Directory from which \code{space} should be loaded.
28		#' @param verbose Logical; if \code{FALSE}, will not show status messages.
29		#' @family Dictionary functions
30		#' @seealso
31		#' To just expand fuzzy terms, see \code{\link{report_term_matches}()}.
32		#'
33		#' Similar information is provided in the \href{https://miserman.github.io/dictionary_builder/}{dictionary builder} web tool.
34		#' @returns A list:
35		#' \itemize{
36		#' \item \strong{\code{expanded}}: A version of \code{dict} with fuzzy terms expanded.
37		#' \item \strong{\code{summary}}: A summary of each dictionary category.
38		#' \item \strong{\code{terms}}: Match (expanded term) similarities within terms and categories.
39		#' \item \strong{\code{suggested}}: If \code{suggest} is \code{TRUE}, a list with suggested
40		#' additions for each dictionary category. Each entry is a named numeric vector with
41		#' similarities for each suggested term.
42		#' }
43		#' @examples
44		#' if (dir.exists("~/Latent Semantic Spaces")) {
45		#' dict <- list(
46		#' furniture = c("table", "chair", "desk", "couch", "sofa*"),
47		#' well_adjusted = c("happy", "bright", "friend", "she", "he", "they")
48		#' )
49		#' dictionary_meta(dict, space_dir = "~/Latent Semantic Spaces")
50		#' }
51		#' @export
52
53		dictionary_meta <- function(
54		dict, space = "auto", n_spaces = 5, suggest = FALSE, suggestion_terms = 10, suggest_stopwords = FALSE,
55		suggest_discriminate = TRUE, expand_cutoff_freq = .98, expand_cutoff_spaces = 10,
56		dimension_prop = 1, pairwise = TRUE, glob = TRUE, space_dir = getOption("lingmatch.lspace.dir"), verbose = TRUE) {
57	!	if (missing(dict)) stop("dict must be specified", call. = FALSE)
58	2x	if (!is.list(dict)) dict <- list(dict)
59	2x	if (is.null(names(dict))) names(dict) <- paste0("cat", seq_along(dict))
60	6x	st <- proc.time()[[3]]
61	6x	if (verbose) cat("preparing terms (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
62	6x	terms <- data.frame(
63	6x	category = rep(names(dict), vapply(dict, length, 0)), term = unlist(dict), stringsAsFactors = FALSE
64		)
65	6x	rownames(terms) <- NULL
66	6x	terms$regex <- paste0("\\b", to_regex(list(terms$term), TRUE, glob)[[1]], "\\b")
67	6x	if (is.character(space)) {
68	3x	term_map <- select.lspace(dir = space_dir, get.map = TRUE)$term_map
69	3x	if (is.null(term_map)) {
70	!	stop(
71	!	"term map not found; specify `space_dir` or provide text",
72	!	call. = FALSE
73		)
74		}
75	3x	if (expand_cutoff_freq > 0 && expand_cutoff_freq < 1) {
76	3x	term_map <- term_map[seq(1, ceiling(nrow(term_map) * expand_cutoff_freq)), , drop = FALSE]
77		}
78	3x	if (expand_cutoff_spaces > 0 && expand_cutoff_spaces < ncol(term_map)) {
79	3x	term_map <- term_map[rowSums(term_map != 0) >= expand_cutoff_spaces, , drop = FALSE]
80		}
81	3x	if (!grepl("^(?:auto\|multi)", space[[1]], TRUE)) {
82	1x	space <- space[space %in% colnames(term_map)]
83	!	if (!length(space)) stop("`space` not found in `term_map` colnames", call. = FALSE)
84	1x	term_map <- term_map[rowSums(term_map[, space, drop = FALSE] != 0) != 0, , drop = FALSE]
85		}
86	3x	space_terms <- rownames(term_map)
87		} else {
88	3x	space_terms <- rownames(space)
89	3x	if (is.null(space_terms)) {
90	!	stop(
91	!	"for space, enter a name or matrix-like object with terms as rownames",
92	!	call. = FALSE
93		)
94		}
95		}
96	6x	if (verbose) cat("expanding terms (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
97	6x	matches <- extract_matches(terms$regex, paste(space_terms, collapse = " "), TRUE)
98	6x	matched_terms <- unique(unlist(lapply(matches, names), use.names = FALSE))
99	1x	if (!length(matched_terms)) stop("no `dict` terms matched any space terms", call. = FALSE)
100	5x	multi <- FALSE
101	5x	if (is.character(space)) {
102	!	if (length(space) == 1 && !missing(n_spaces) && n_spaces > 1) space <- "multi"
103	3x	multi <- grepl("^multi", space[[1]], TRUE)
104	3x	if (length(space) > 1 \|\| multi) {
105	2x	if (length(space) == 1 && multi) {
106	1x	term_map_matched <- term_map[rownames(term_map) %in% matched_terms, , drop = FALSE]
107	1x	commonness <- sort(-rowSums(term_map_matched != 0))
108	1x	common_terms <- names(commonness[commonness >= min(max(commonness), n_spaces)])
109	1x	space <- names(sort(-colSums(term_map_matched[common_terms, , drop = FALSE])))[
110	1x	seq_len(max(1, min(nrow(term_map), n_spaces)))
111		]
112		}
113	2x	multi <- TRUE
114	2x	term_map <- term_map[, space, drop = FALSE]
115	2x	space_terms <- rownames(term_map)[rowSums(term_map != 0) == length(space)]
116	2x	space_name <- paste(space, collapse = ", ")
117	2x	if (verbose) cat("loading spaces (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
118	2x	space <- lapply(space, function(s) {
119	7x	lma_lspace(
120	7x	if (suggest) space_terms else matched_terms[matched_terms %in% space_terms], s,
121	7x	dir = space_dir
122		)
123		})
124	2x	matches <- lapply(matches, function(l) l[names(l) %in% space_terms])
125	2x	matched_terms <- unique(unlist(lapply(matches, names), use.names = FALSE))
126		} else {
127	1x	if (space == "auto") {
128	1x	space <- colnames(term_map)[
129	1x	which.max(colSums(term_map[rownames(term_map) %in% matched_terms, ] != 0))
130		]
131		}
132	1x	space_name <- space
133	1x	space_terms <- rownames(term_map)[term_map[, space] != 0]
134	1x	if (verbose) cat("loading space (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
135	1x	space <- lma_lspace(
136	1x	if (suggest) space_terms else matched_terms[matched_terms %in% space_terms], space,
137	1x	dir = space_dir
138		)
139	1x	matches <- lapply(matches, function(l) l[names(l) %in% space_terms])
140	1x	matched_terms <- unique(unlist(lapply(matches, names), use.names = FALSE))
141		}
142		} else {
143	!	if (is.data.frame(space)) space <- as.matrix(space)
144	2x	space_name <- "custom"
145		}
146	5x	cat_names <- structure(names(dict), names = names(dict))
147	5x	dict_exp <- lapply(cat_names, function(cat) {
148	17x	unique(names(unlist(matches[terms$category == cat])))
149		})
150	5x	if (multi) {
151	1x	if (!suggest) space_terms <- rownames(space[[1]])
152	2x	if (verbose) cat("calculating term similarities (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
153	2x	sims <- lapply(cat_names, function(cat) {
154	8x	su <- space_terms %in% dict_exp[[cat]]
155	8x	if (any(su)) {
156	6x	aggsim <- NULL
157	6x	for (i in seq_along(space)) {
158	21x	s <- space[[i]]
159	21x	if (dimension_prop < 1) {
160	15x	loadings <- colMeans(s[su, , drop = FALSE])
161	15x	dsu <- order(-loadings)[seq(1, max(1, ceiling(ncol(s) * dimension_prop)))]
162	15x	s <- s[, dsu, drop = FALSE]
163		}
164	21x	if (pairwise) {
165	6x	sim <- lma_simets(s, s[su, ], metric = "cosine", pairwise = TRUE, symmetrical = TRUE)
166	4x	if (is.null(dim(sim))) sim <- as(t(t(sim)), "CsparseMatrix")
167	6x	diag(sim[su, , drop = FALSE]) <- 0
168	6x	ms <- min(sim)
169	6x	sim <- (sim - ms) / (max(sim) - ms) * sign(sim)
170	6x	diag(sim[su, , drop = FALSE]) <- 1
171		} else {
172	15x	sim <- lma_simets(s, colMeans(s[su, , drop = FALSE]), metric = "cosine", pairwise = TRUE, symmetrical = TRUE)
173		}
174	21x	if (i == 1) {
175	6x	aggsim <- sim
176		} else {
177	15x	aggsim <- aggsim + sim
178		}
179		}
180	3x	if (is.null(dim(aggsim))) aggsim <- as(t(t(aggsim)), "CsparseMatrix")
181	6x	aggsim / length(space)
182		}
183		})
184		} else {
185	3x	space <- as(space, "CsparseMatrix")
186	!	if (!suggest) space_terms <- rownames(space)
187	3x	if (verbose) cat("calculating term similarities (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
188	3x	sims <- lapply(cat_names, function(cat) {
189	9x	su <- space_terms %in% dict_exp[[cat]]
190	9x	if (any(su)) {
191	7x	if (dimension_prop < 1) {
192	3x	loadings <- colSums(space[su, , drop = FALSE])
193	3x	dsu <- order(loadings, decreasing = TRUE)[seq(1, max(1, ceiling(ncol(space) * dimension_prop)))]
194	3x	space <- space[, dsu, drop = FALSE]
195		}
196	7x	sim <- lma_simets(
197	7x	space, if (pairwise) space[su, , drop = FALSE] else colMeans(space[su, , drop = FALSE]),
198	7x	metric = "cosine", pairwise = TRUE, symmetrical = TRUE
199		)
200	3x	if (is.null(dim(sim))) sim <- as(t(t(sim)), "CsparseMatrix")
201	7x	sim
202		}
203		})
204		}
205	5x	if (suggest) {
206	4x	if (verbose) cat("identifying potential additions (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
207	4x	if (!suggest_stopwords) is_stop <- lma_dict(as.function = TRUE)
208	4x	full_loadings <- do.call(cbind, lapply(sims, function(x) {
209	13x	if (length(x)) {
210	10x	agg <- rowMeans(x)
211	10x	agg[agg < 0] <- 0
212	10x	agg
213		} else {
214	3x	structure(numeric(length(space_terms)), names = space_terms)
215		}
216		}))
217	4x	loading_cat <- names(cat_names)[max.col(full_loadings)]
218	4x	suggested <- lapply(cat_names, function(cat) {
219	13x	s <- sims[[cat]]
220	13x	if (length(s)) {
221	10x	su <- !rownames(s) %in% dict_exp[[cat]] & loading_cat == cat
222	10x	loadings <- sort(if (suggest_discriminate && ncol(full_loadings) > 1) {
223	9x	nl <- full_loadings[su, colnames(full_loadings) != cat, drop = FALSE]
224	9x	(rowMeans(s[su, , drop = FALSE]) - nl[
225	9x	rep(seq_len(ncol(nl)), each = nrow(nl)) == max.col(nl)
226	9x	]) / 2
227		} else {
228	1x	rowMeans(s[su, , drop = FALSE])
229	10x	}, TRUE)
230	10x	if (length(loadings)) {
231	10x	if (!suggest_stopwords) loadings <- loadings[!is_stop(names(loadings))]
232	10x	co <- min(length(loadings), max(which(loadings > 0)), suggestion_terms)
233	10x	loadings[loadings > loadings[co] + Reduce("-", range(loadings[seq(1, co)])) / 2]
234		}
235		}
236		})
237		}
238	5x	if (verbose) cat("preparing results (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
239	5x	match_counts <- vapply(matches, length, 0)
240	5x	term_summary <- data.frame(
241	5x	terms[rep(seq_len(nrow(terms)), match_counts), c("category", "term")],
242	5x	match = unlist(lapply(matches, names)),
243	5x	stringsAsFactors = FALSE
244		)
245	5x	term_summary <- cbind(term_summary, do.call(rbind, lapply(
246	5x	split(
247	5x	term_summary[, c("category", "match", "term")],
248	5x	term_summary$category
249	5x	)[unique(term_summary$category)],
250	5x	function(cl) {
251	13x	cat <- cl$category[[1]]
252	13x	if (pairwise) {
253	10x	s <- sims[[cat]]
254		} else {
255	3x	su <- space_terms %in% dict_exp[[cat]]
256	3x	if (multi) {
257	3x	aggsim <- NULL
258	3x	for (i in seq_along(space)) {
259	15x	s <- space[[i]]
260	15x	if (sum(su) == 1) {
261	5x	sim <- Matrix(1, 1, dimnames = as.list(rep(rownames(s)[su], 2)), sparse = TRUE)
262		} else {
263	10x	sim <- lma_simets(s[su, , drop = FALSE], metric = "cosine", pairwise = TRUE, symmetrical = TRUE)
264	!	if (is.null(dim(sim))) sim <- as(t(t(sim)), "CsparseMatrix")
265	10x	diag(sim) <- 0
266	10x	ms <- min(sim)
267	10x	sim <- (sim - ms) / (max(sim) - ms) * sign(sim)
268	10x	diag(sim) <- 1
269		}
270	15x	if (i == 1) {
271	3x	aggsim <- sim
272		} else {
273	12x	aggsim <- aggsim + sim
274		}
275		}
276	3x	s <- aggsim / length(space)
277		} else {
278	!	s <- lma_simets(space[su, , drop = FALSE], metric = "cosine", pairwise = TRUE, symmetrical = TRUE)
279		}
280		}
281	13x	if (is.null(s)) {
282	!	cbind(sim.term = cl$match, sim.category = 0)
283		} else {
284	13x	su <- !(cl$match %in% rownames(s))
285	13x	if (any(su)) {
286	!	s <- rbind(s, Matrix(
287	!	0, sum(su), ncol(s),
288	!	dimnames = list(cl$match[su], colnames(s)), sparse = TRUE
289		))
290		}
291	13x	term_sims <- unlist(lapply(unname(split(cl$match, cl$term)[unique(cl$term)]), function(l) {
292	24x	if (length(l) == 1) {
293	20x	structure(1, names = l)
294		} else {
295	4x	cols <- l[l %in% colnames(s)]
296	4x	s[l, cols[which.min(nchar(cols))]]
297		}
298		}))
299	13x	cat_sims <- s[cl$match, which.max(if (is.null(dim(s))) {
300	!	s
301	13x	} else if (ncol(s) == 1) {
302	6x	1
303		} else {
304	7x	colMeans(s[colnames(s), , drop = FALSE])
305		})]
306	13x	cbind(sim.term = term_sims, sim.category = if (is.null(cat_sims)) 0 else cat_sims)
307		}
308		}
309		)))
310	5x	summary <- cbind(data.frame(
311	5x	category = cat_names,
312	5x	n_terms = vapply(dict, length, 0),
313	5x	n_expanded = tapply(match_counts, terms$category, sum)[cat_names],
314	5x	sim.space = space_name,
315	5x	stringsAsFactors = FALSE
316	5x	), sim = do.call(rbind, lapply(sims, function(s) {
317	17x	if (length(s) && !is.null(ncol(s)) && ncol(s) == 1) {
318	8x	m <- s[matched_terms, 1]
319	9x	} else if (length(s)) {
320	5x	s <- s[colnames(s), , drop = FALSE]
321	5x	m <- (colSums(s) - 1) / (ncol(s) - 1)
322		} else {
323	4x	m <- 0
324		}
325	17x	structure(summary(m), names = c("min", "q1", "median", "mean", "q3", "max"))
326		})))
327	5x	if (verbose) cat("done (", round(proc.time()[[3]] - st, 4), ")\n", sep = "")
328	5x	list(expanded = dict_exp, summary = summary, terms = term_summary, suggested = if (suggest) suggested)
329		}

1		#' Document-Term Matrix Creation
2		#'
3		#' Creates a document-term matrix (dtm) from a set of texts.
4		#' @param text Texts to be processed. This can be a vector (such as a column in a data frame)
5		#' or list. When a list, these can be in the form returned with \code{tokens.only = TRUE},
6		#' or a list with named vectors, where names are tokens and values are frequencies or the like.
7		#' @param exclude A character vector of words to be excluded. If \code{exclude} is a single string
8		#' matching \code{'function'}, \code{lma_dict(1:9)} will be used.
9		#' @param context A character vector used to reformat text based on look- ahead/behind. For example,
10		#' you might attempt to disambiguate \emph{like} by reformatting certain \emph{like}s
11		#' (e.g., \code{context = c('(i) like', '(you) like', '(do) like')}, where words in parentheses are
12		#' the context for the target word, and asterisks denote partial matching). This would be converted
13		#' to regular expression (i.e., \code{'(? <= i) like\\\\b'}) which, if matched, would be
14		#' replaced with a coded version of the word (e.g., \code{"Hey, i like that!"} would become
15		#' \code{"Hey, i i-like that!"}). This would probably only be useful for categorization, where a
16		#' dictionary would only include one or another version of a word (e.g., the LIWC 2015 dictionary
17		#' does something like this with \emph{like}, and LIWC 2007 did something like this with
18		#' \emph{kind (of)}, both to try and clean up the posemo category).
19		#' @param replace.special Logical: if \code{TRUE}, special characters are replaced with regular
20		#' equivalents using the \code{\link{lma_dict}} special function.
21		#' @param numbers Logical: if \code{TRUE}, numbers are preserved.
22		#' @param punct Logical: if \code{TRUE}, punctuation is preserved.
23		#' @param urls Logical: if \code{FALSE}, attempts to replace all urls with "repurl".
24		#' @param emojis Logical: if \code{TRUE}, attempts to replace emojis (e.g., ":(" would be replaced
25		#' with "repfrown").
26		#' @param to.lower Logical: if \code{FALSE}, words with different capitalization are treated as
27		#' different terms.
28		#' @param word.break A regular expression string determining the way words are split. Default is
29		#' \code{' +'} which breaks words at one or more blank spaces. You may also like to break by
30		#' dashes or slashes (\code{'[ /-]+'}), depending on the text.
31		#' @param dc.min Numeric: excludes terms appearing in the set number or fewer documents.
32		#' Default is 0 (no limit).
33		#' @param dc.max Numeric: excludes terms appearing in the set number or more. Default
34		#' is Inf (no limit).
35		#' @param sparse Logical: if \code{FALSE}, a regular dense matrix is returned.
36		#' @param tokens.only Logical: if \code{TRUE}, returns a list rather than a matrix, with these entries:
37		#' \tabular{ll}{
38		#' \code{tokens} \tab A vector of indices with terms as names. \cr
39		#' \code{frequencies} \tab A vector of counts with terms as names. \cr
40		#' \code{WC} \tab A vector of term counts for each document. \cr
41		#' \code{indices} \tab A list with a vector of token indices for each document. \cr
42		#' }
43		#' @note
44		#' This is a relatively simple way to make a dtm. To calculate the (more or less) standard forms of
45		#' LSM and LSS, a somewhat raw dtm should be fine, because both processes essentially use
46		#' dictionaries (obviating stemming) and weighting or categorization (largely obviating 'stop word'
47		#' removal). The exact effect of additional processing will depend on the dictionary/semantic space
48		#' and weighting scheme used (particularly for LSA). This function also does some processing which
49		#' may matter if you plan on categorizing with categories that have terms with look- ahead/behind assertions
50		#' (like LIWC dictionaries). Otherwise, other methods may be faster, more memory efficient, and/or more featureful.
51		#' @return A sparse matrix (or regular matrix if \code{sparse = FALSE}), with a row per \code{text},
52		#' and column per term, or a list if \code{tokens.only = TRUE}. Includes an attribute with options (\code{opts}),
53		#' and attributes with word count (\code{WC}) and column sums (\code{colsums}) if \code{tokens.only = FALSE}.
54		#' @examples
55		#' text <- c(
56		#' "Why, hello there! How are you this evening?",
57		#' "I am well, thank you for your inquiry!",
58		#' "You are a most good at social interactions person!",
59		#' "Why, thank you! You're not all bad yourself!"
60		#' )
61		#'
62		#' lma_dtm(text)
63		#'
64		#' # return tokens only
65		#' (tokens <- lma_dtm(text, tokens.only = TRUE))
66		#'
67		#' ## convert those to a regular DTM
68		#' lma_dtm(tokens)
69		#'
70		#' # convert a list-representation to a sparse matrix
71		#' lma_dtm(list(
72		#' doc1 = c(why = 1, hello = 1, there = 1),
73		#' doc2 = c(i = 1, am = 1, well = 1)
74		#' ))
75		#' @export
76
77		lma_dtm <- function(text, exclude = NULL, context = NULL, replace.special = FALSE, numbers = FALSE,
78		punct = FALSE, urls = TRUE, emojis = FALSE, to.lower = TRUE, word.break = " +", dc.min = 0,
79		dc.max = Inf, sparse = TRUE, tokens.only = FALSE) {
80	80x	if (!is.null(dim(text))) {
81	!	if (is.character(text[, 1]) \|\| is.factor(text[, 1])) {
82	!	text <- text[, 1]
83		} else {
84	!	stop("enter a vector of texts as the first argument")
85		}
86		}
87	80x	if (is.list(text)) {
88	4x	if (all(c("tokens", "indices") %in% names(text))) {
89	2x	m <- do.call(rbind, lapply(seq_along(text$indices), function(i) {
90	30x	if (length(text$indices[[i]])) {
91	28x	inds <- as.factor(text$indices[[i]])
92	28x	cbind(i, as.integer(levels(inds)), tabulate(inds))
93		}
94		}))
95	2x	dtm <- sparseMatrix(m[, 1], m[, 2],
96	2x	x = m[, 3], dims = c(length(text$indices), length(text$tokens)),
97	2x	dimnames = list(NULL, if (is.character(text$tokens)) text$tokens else names(text$tokens))
98		)
99	!	if (!sparse) dtm <- as.matrix(dtm)
100	2x	attr(dtm, "colsums") <- text$frequencies
101	2x	attr(dtm, "type") <- "count"
102	2x	attr(dtm, "WC") <- text$WC
103	2x	attr(dtm, "opts") <- attr(text, "opts")
104	2x	attr(dtm, "time") <- attr(text, "time")
105	2x	return(dtm)
106		} else {
107	2x	tokens <- unlist(unname(text), recursive = FALSE)
108	2x	cinds <- unique(names(tokens))
109	2x	if (is.null(cinds)) {
110	1x	text <- unlist(text)
111		} else {
112	1x	rinds <- rep(seq_along(text), vapply(text, length, 0))
113	1x	cinds <- structure(seq_along(cinds), names = cinds)
114	1x	dtm <- sparseMatrix(
115	1x	rinds, cinds[names(tokens)],
116	1x	x = tokens,
117	1x	dims = c(length(text), length(cinds)), dimnames = list(names(text), names(cinds))
118		)
119	1x	if (!sparse) dtm <- as.matrix(dtm)
120	1x	return(dtm)
121		}
122		}
123		}
124	!	if (is.null(text)) stop(substitute(text), " not found")
125	77x	docnames <- names(text)
126	77x	if (is.character(text) && all(nchar(text) < 500) && all(file.exists(text))) {
127	1x	text <- if (length(text) != 1 \|\| dir.exists(text)) read.segments(text) else readLines(text)
128		}
129	77x	text <- paste(" ", text, " ")
130	77x	st <- proc.time()[[3]]
131	!	if (replace.special) text <- lma_dict("special", as.function = gsub)(text)
132	77x	text <- gsub("(?<=[^a-z0-9])'\|'(?=[^a-z0-9])", '"', text, TRUE, TRUE)
133	77x	if (!urls) {
134	12x	text <- gsub(paste0(
135	12x	"\\s[a-z]+://[^\\s]\|www\\.[^\\s]\|\\s[a-z_~-]+\\.[a-z_~-]{2,}[^\\s]*\|\\s[a-z_~-]+\\.",
136	12x	"(?:io\|com\|net\|org\|gov\|edu)\\s"
137	12x	), " repurl ", text, TRUE, TRUE)
138	12x	text <- gsub("(?<=[A-Z])\\.\\s", " ", text, perl = TRUE)
139		}
140	77x	text <- gsub("\\s+", " ", text)
141	77x	text <- gsub("\\s(etc\|st\|rd\|ft\|feat\|dr\|drs\|mr\|ms\|mrs\|messrs\|jr\|prof)\\.", " \\1tempperiod", text, TRUE)
142	77x	text <- gsub("\\s\\.\|\\.\\s", " . ", text)
143	77x	if (any(punct, emojis, !is.null(context))) {
144	22x	special <- lma_dict(special)[[1]]
145	22x	if (!missing(context) && length(context) == 1 && grepl("like", context, TRUE)) {
146	!	context <- special[["LIKE"]]
147		}
148	18x	if (punct) text <- gsub(special[["ELLIPSIS"]], " repellipsis ", text)
149	22x	if (emojis) {
150	2x	for (type in c("SMILE", "FROWN")) {
151	4x	text <- gsub(
152	4x	special[[type]], paste0(" rep", tolower(type), " "), text,
153	4x	perl = TRUE
154		)
155		}
156		}
157	22x	if (!missing(context)) {
158	2x	if (!any(grepl("[?=]", context))) {
159	1x	context <- gsub("^\\(", "(?<=", context)
160	1x	context <- gsub("\\((?!\\?)", "(?=", context, perl = TRUE)
161	1x	context <- gsub("(?<![)*])$", "\\\\b", context, perl = TRUE)
162	1x	context <- gsub("\\", "\\\\w", context, perl = TRUE)
163		}
164	2x	context <- structure(
165	2x	as.list(context),
166	2x	names = paste("", gsub("--+", "-", gsub("[\\s^[]\|\\\\s]", "-",
167	2x	gsub("[^a-z0-9\\s\\\\']\|\\\\[wbs]", "", context, TRUE, TRUE),
168	2x	perl = TRUE
169		)), "")
170		)
171	2x	for (rn in names(context)) text <- gsub(context[[rn]], rn, text, perl = TRUE)
172		}
173		}
174	73x	if (to.lower) text <- tolower(text)
175	77x	if (!is.null(exclude)) {
176	3x	if (length(exclude) == 1 && grepl(exclude, "function", TRUE)) {
177	1x	exclude <- unlist(lma_dict(), use.names = FALSE)
178	2x	} else if (is.list(exclude)) exclude <- unlist(exclude, use.names = FALSE)
179		}
180	64x	if (!numbers) text <- gsub("[[:punct:]][0-9][0-9,.el-]", " ", text, TRUE, TRUE)
181	77x	text <- gsub(paste0(
182	77x	"([^a-z0-9.,':/?=#\\s-]\|[:/?=#](?=\\s)\|(?:(?<=\\s)[:/=-]\|,)(?=[a-z])\|(?<=[^a-z0-9])",
183	77x	"(,(?=[a-z0-9])\|[.-](?=[a-z]))\|[.,'-](?=[^0-9a-z]\|[.,'-]))"
184	77x	), " \\1 ", text, TRUE, TRUE)
185	77x	text <- gsub("(\\s[a-z]+)/([a-z]+\\s)", " \\1 / \\2 ", text, TRUE, TRUE)
186	77x	text <- gsub("([a-z0-9.,'-].*[^a-z0-9])", " \\1 ", text, TRUE, TRUE)
187	77x	text <- gsub("(?<=[a-z])\\s['\u00E7\u00ED]\\s(?=[a-z])", "'", text, TRUE, TRUE)
188	77x	if (!punct) {
189	59x	text <- gsub("[^A-Za-z0-9'._-]+", " ", text)
190	59x	text <- gsub("(?=[a-z])\\.+\|(?<=[^a-z0-9])['._-]+\|'+\\s", " ", text, TRUE, TRUE)
191		}
192	77x	text <- gsub("tempperiod", ".", text, fixed = TRUE)
193	77x	text <- gsub("^\\s+\|\\s(?=\\s)\|\\s+$", "", text, perl = TRUE)
194	77x	text <- strsplit(text, word.break)
195	77x	words <- sort(unique(unlist(text)))
196	77x	words <- words[!words == ""]
197	77x	if (!is.null(exclude)) {
198	!	if (is.list(exclude)) exclude <- unlist(exclude, use.names = FALSE)
199	!	if (!any(grepl("^", exclude, fixed = TRUE))) exclude <- gsub("\\^\\\|\\\\$", "", paste0("^", exclude, "$"))
200	3x	if (any(ck <- grepl("[[({]", exclude) + grepl("[})]\|\\]", exclude) == 1)) {
201	!	exclude[ck] <- gsub("([([{}\\])])", "\\\\\\1", exclude[ck], perl = TRUE)
202		}
203	3x	words <- grep(paste(exclude, collapse = "\|"), words, value = TRUE, invert = TRUE)
204		}
205	77x	if (tokens.only) {
206	2x	m <- match_terms(
207	2x	text, words, !grepl("^(?:[[:punct:]]\|repellipsis)$", words),
208	2x	c(length(text), length(words)), is.null(exclude), TRUE
209		)
210	2x	names(m) <- c("tokens", "frequencies", "WC", "indices")
211	2x	m$tokens <- m$tokens + 1L
212	2x	m$tokens <- sort(m$tokens)
213	2x	inds <- vector("list", length(text))
214	2x	l <- 0
215	2x	for (i in seq_along(inds)) {
216	30x	if (m$WC[i]) {
217	28x	inds[[i]] <- m$indices[seq_len(m$WC[i]) + l] + 1L
218	28x	l <- l + m$WC[i]
219		} else {
220	2x	inds[[i]] <- integer()
221		}
222		}
223	2x	m$indices <- inds
224	2x	if (dc.min > 0 \|\| dc.max < Inf) {
225	1x	su <- m$frequencies > dc.min & m$frequencies < dc.max
226	1x	if (any(!su)) {
227	1x	if (!any(su)) {
228	!	warning(
229	!	"document count bounds [", dc.min, ", ", dc.max, "] exlcuded all terms, so they were ignored",
230	!	call. = FALSE
231		)
232		} else {
233	1x	m$frequencies <- m$frequencies[su]
234	1x	ex <- m$tokens[!su]
235	1x	m$tokens <- m$tokens[su]
236	1x	new_inds <- structure(seq_along(m$tokens), names = m$tokens)
237	1x	m$tokens[] <- new_inds
238	1x	for (i in seq_along(m$indices)) {
239	15x	inds <- m$indices[[i]]
240	15x	m$indices[[i]] <- unname(new_inds[as.character(inds[!inds %in% ex])])
241	15x	m$WC[i] <- length(m$indices[[i]])
242		}
243		}
244		}
245		}
246	2x	names(m$frequencies) <- names(m$tokens)
247		} else {
248	75x	msu <- match_terms(
249	75x	text, words, !grepl("^(?:[[:punct:]]\|repellipsis)$", words),
250	75x	c(length(text), length(words)), is.null(exclude), FALSE
251		)
252	75x	m <- if (sparse) as(msu[[1]], "CsparseMatrix") else as.matrix(msu[[1]])
253	2x	if (length(docnames) == nrow(m)) rownames(m) <- docnames
254	75x	su <- msu[[3]] > dc.min & msu[[3]] < dc.max
255	75x	names(msu[[3]]) <- words
256	75x	if (any(!su)) {
257	1x	if (!any(su)) {
258	!	warning(
259	!	"document count bounds [", dc.min, ", ", dc.max, "] exlcuded all terms, so they were ignored",
260	!	call. = FALSE
261		)
262		} else {
263	1x	m <- m[, su, drop = FALSE]
264		}
265		}
266	75x	attr(m, "WC") <- unlist(msu[[2]], use.names = FALSE)
267	75x	attr(m, "colsums") <- msu[[3]]
268	75x	attr(m, "type") <- "count"
269	75x	if (!missing(dc.min) \|\| !missing(dc.max)) {
270	1x	attr(m, "info") <- paste(
271	1x	"a lim of", dc.min, "and", dc.max, "left", sum(su), "of", length(words), "unique terms"
272		)
273		}
274		}
275	77x	attr(m, "opts") <- c(numbers = numbers, punct = punct, urls = urls, to.lower = to.lower)
276	77x	attr(m, "time") <- c(dtm = proc.time()[[3]] - st)
277	77x	m
278		}

1		#' Initialize Directories for Dictionaries and Latent Semantic Spaces
2		#'
3		#' Creates directories for dictionaries and latent semantic spaces if needed, sets them as the
4		#' \cr \code{lingmatch.dict.dir} and \code{lingmatch.lspace.dir} options if they are not already set,
5		#' and creates links to them in their expected locations (\code{'~/Dictionaries'} and
6		#' \code{'~/Latent Semantic Spaces'}) by default if applicable.
7		#' @param base Path to a directory in which to create the \code{dict} and \code{lspace} subdirectories.
8		#' @param dict Path to the dictionaries directory relative to \code{base}.
9		#' @param lspace Path to the latent semantic spaces directory relative to \code{base}.
10		#' @param link Logical; if \code{TRUE} (default), the full \code{dict} and/or \code{lspace} paths exist
11		#' (potentially after being created), and they are not \code{'~/Dictionaries'} or \code{'~/Latent Semantic Spaces'}
12		#' respectively, junctions (Windows) or symbolic links will be created: \code{~/Dictionaries} \code{<<===>>}
13		#' \code{dict} and \code{~/Latent Semantic Spaces} \code{<<===>>} \code{lspace}.
14		#' @return Paths to the [1] dictionaries and [2] latent semantic space directories, or a single path if only
15		#' \code{dict} or \code{lspace} is specified.
16		#' @examples
17		#' \dontrun{
18		#'
19		#' # set up the expected dictionary and latent semantic space directories
20		#' lma_initdirs("~")
21		#'
22		#' # set up directories elsewhere, and links to the expected locations
23		#' lma_initdirs("d:")
24		#'
25		#' # point options and create links to preexisting directories
26		#' lma_initdirs("~/NLP_Resources", "Dicts", "Dicts/Embeddings")
27		#'
28		#' # create just a dictionaries directory and set the
29		#' # lingmatch.dict.dir option without creating a link
30		#' lma_initdirs(dict = "z:/external_dictionaries", link = FALSE)
31		#' }
32		#' @export
33
34		lma_initdirs <- function(base = "", dict = "Dictionaries", lspace = "Latent Semantic Spaces", link = TRUE) {
35	3x	mck <- c(missing(dict), missing(lspace))
36	3x	if (base == "" && all(mck)) {
37	!	base <- gsub('^[\'"]+\|[\'"]+$', "", readline(paste0(
38	!	"Enter the path to a directory; ~ is recommended: \n",
39	!	"This is where ", dict, " and ", lspace, " subdirectories will be made."
40		)))
41	!	if (grepl("^(?:cancel\|exit\|stop\|q\|x\|quit\|no\|nvm\|nevermind)?$", tolower(base))) {
42	!	stop(
43	!	"Specify a path to a directory in which you want dictionaries",
44	!	' and latent semantic spaces to be stored; e.g., "~".',
45	!	call. = FALSE
46		)
47		}
48		}
49	3x	dirs <- normalizePath(paste0(base, if (base != "") "/", c(dict, lspace)), "/", FALSE)
50	3x	names(dirs) <- c("dict", "lspace")
51	2x	if (!all(mck)) dirs <- dirs[!mck]
52	3x	if ("dict" %in% names(dirs)) {
53	2x	if (!dir.exists(dirs[["dict"]])) dir.create(dirs[["dict"]], recursive = TRUE)
54	1x	if (getOption("lingmatch.dict.dir") == "") options(lingmatch.dict.dir = dirs[["dict"]])
55		}
56	3x	if ("lspace" %in% names(dirs)) {
57	1x	if (!dir.exists(dirs[["lspace"]])) dir.create(dirs[["lspace"]], recursive = TRUE)
58	1x	if (getOption("lingmatch.lspace.dir") == "") options(lingmatch.lspace.dir = dirs[["lspace"]])
59		}
60	3x	if (link) {
61	!	linker <- if (Sys.info()[["sysname"]] == "Windows") Sys.junction else file.symlink
62	!	if (dir.exists(dirs[["dict"]]) && !dir.exists("~/Dictionaries")) {
63	!	linker(dirs[["dict"]], "~/Dictionaries")
64	!	message("created dictionaries link:\n ", dirs[["dict"]], " <<==>> ", path.expand("~/Dictionaries"))
65		}
66	!	if (dir.exists(dirs[["lspace"]]) && !dir.exists("~/Latent Semantic Spaces")) {
67	!	linker(dirs[["lspace"]], "~/Latent Semantic Spaces")
68	!	message("created latent space link:\n ", dirs[["lspace"]], " <<==>> ", path.expand("~/Latent Semantic Spaces"))
69		}
70		}
71	3x	dirs
72		}

1		#' Standardize a Latent Semantic Space
2		#'
3		#' Reformat a .rda file which has a matrix with terms as row names, or a plain-text embeddings file
4		#' which has a term at the start of each line, and consistent delimiting characters. Plain-text files
5		#' are processed line-by-line, so large spaces can be reformatted RAM-conservatively.
6		#'
7		#' @param infile Name of the .rda or plain-text file relative to \code{dir}, \cr
8		#' e.g., "default.rda" or "glove/glove.6B.300d.txt".
9		#' @param name Base name of the reformatted file and term file; e.g., "glove" would result in
10		#' \code{glove.dat} and \code{glove_terms.txt} in \code{outdir}.
11		#' @param sep Delimiting character between values in each line, e.g., \code{" "} or \code{"\\t"}.
12		#' Only applies to plain-text files.
13		#' @param digits Number of digits to round values to; default is 9.
14		#' @param dir Path to folder containing \code{infile}s. \cr Default is \code{getOption('lingmatch.lspace.dir')},
15		#' which must be set in the current session. If this is not specified and \code{infile} is a full path,
16		#' \code{dir} will be set to \code{infile}'s parent directory.
17		#' @param outdir Path to folder in which to save standardized files; default is \code{dir}.
18		#' @param remove A string with a regex pattern to be removed from term names \cr (i.e., \code{gsub(remove,}
19		#' \code{"", term)}); default is \code{""}, which is ignored.
20		#' @param term_check A string with a regex pattern by which to filter terms; i.e., only lines with fully
21		#' matched terms are written to the reformatted file. The default attempts to retain only regular words, including
22		#' those with dashes, foreword slashes, and periods. Set to an empty string (\code{""}) to write all lines
23		#' regardless of term.
24		#' @param verbose Logical: if \code{TRUE}, prints the current line number and its term to the console every 1,000 lines.
25		#' Only applies to plain-text files.
26		#' @family Latent Semantic Space functions
27		#' @return Path to the standardized [1] data file and [2] terms file if applicable.
28		#' @examples
29		#' \dontrun{
30		#'
31		#' # from https://sites.google.com/site/fritzgntr/software-resources/semantic_spaces
32		#' standardize.lspace("EN_100k_lsa.rda", "100k_lsa")
33		#'
34		#' # from https://fasttext.cc/docs/en/english-vectors.html
35		#' standardize.lspace("crawl-300d-2M.vec", "facebook_crawl")
36		#'
37		#' # Standardized versions of these spaces can also be downloaded with download.lspace.
38		#' }
39		#' @export
40
41		standardize.lspace <- function(infile, name, sep = " ", digits = 9, dir = getOption("lingmatch.lspace.dir"),
42		outdir = dir, remove = "", term_check = "^[a-zA-Z]+$\|^['a-zA-Z][a-zA-Z.'\\/-]*[a-zA-Z.]$", verbose = FALSE) {
43	3x	if (is.character(infile) && file.exists(infile) && missing(dir)) {
44	1x	dir <- dirname(normalizePath(infile, mustWork = FALSE))
45	1x	if (missing(outdir)) outdir <- dir
46	2x	} else if (dir == "") {
47	!	if (outdir != "") {
48	!	dir <- outdir
49		} else {
50	!	stop(paste(
51	!	"specify a directory (dir), or set the lspace directory option\n(e.g.,",
52	!	'options(lingmatch.lspace.dir = ~/Latent Semantic Spaces")) or call lma_initdir()'
53	!	), call. = FALSE)
54		}
55		}
56	!	if (!is.character(term_check)) term_check <- ""
57	3x	if (is.character(infile)) {
58	3x	ip <- normalizePath(paste0(dir, "/", infile), "/", FALSE)
59	1x	if (!file.exists(ip)) ip <- infile
60		}
61	3x	op <- normalizePath(paste0(outdir, "/", name), "/", FALSE)
62	3x	fs <- op
63	3x	if (!is.character(infile) \|\| grepl("\\.rda$", infile)) {
64	1x	if (is.character(infile)) {
65	1x	f <- load(ip)
66	1x	o <- get(f)
67		} else {
68	!	o <- infile
69		}
70	1x	o <- round(o, digits)
71	1x	ot <- rownames(o)
72	!	if (remove != "") ot <- gsub(remove, "", ot)
73	1x	if (term_check != "") {
74	1x	su <- grepl(term_check, ot)
75	1x	o <- o[su, ]
76	1x	ot <- ot[su]
77		}
78	1x	fs <- paste0(op, c(".dat", "_terms.txt"))
79	1x	writeLines(ot, fs[2])
80	1x	write(formatC(t(o), digits, 0, "f"), fs[1], ncol(o))
81	1x	if (is.character(infile)) rm(f, "o")
82		} else {
83	!	if (!file.exists(ip)) stop("infile does not exist: ", ip)
84	2x	if (!grepl(term_check, scan(ip, "", 1, sep = sep, quiet = TRUE))) {
85	!	stop("infile does not appear to start with a term: ", ip)
86		}
87	2x	fs <- paste0(op, ".dat")
88	2x	reformat_embedding(ip, op, sep, digits, remove, term_check, verbose)
89		}
90	3x	message("created ", op, ".dat\nfrom ", ip)
91	3x	fs
92		}

1		#' Read and Segment Multiple Texts
2		#'
3		#' Split texts by word count or specific characters. Input texts directly, or read them in from files.
4		#'
5		#' @param path Path to a folder containing files, or a vector of paths to files. If no folders or files are
6		#' recognized in \code{path}, it is treated as \code{text}.
7		#' @param segment Specifies how the text of each file should be segmented. If a character, split at that character;
8		#' '\\n' by default. If a number, texts will be broken into that many segments, each with a roughly equal number of
9		#' words.
10		#' @param ext The extension of the files you want to read in. '.txt' by default.
11		#' @param subdir Logical; if \code{TRUE}, files in folders in \code{path} will also be included.
12		#' @param segment.size Logical; if specified, \code{segment} will be ignored, and texts will be broken into
13		#' segments containing roughly \code{segment.size} number of words.
14		#' @param bysentence Logical; if \code{TRUE}, and \code{segment} is a number or \code{segment.size} is specified,
15		#' sentences will be kept together, rather than potentially being broken across segments.
16		#' @param end_in_quotes Logical; if \code{FALSE}, sentence-ending marks (\code{.?!}) will not be considered when
17		#' immediately followed by a quotation mark. For example, \code{'"Word." Word.'} would be considered one sentence.
18		#' @param preclean Logical; if \code{TRUE}, text will be cleaned with \code{lma_dict(special)} before
19		#' segmentation.
20		#' @param text A character vector with text to be split, used in place of \code{path}. Each entry is treated as a file.
21		#' @return A \code{data.frame} with columns for file names (\code{input}),
22		#' segment number within file (\code{segment}), word count for each segment (\code{WC}), and the text of
23		#' each segment (\code{text}).
24		#' @examples
25		#' # split preloaded text
26		#' read.segments("split this text into two segments", 2)
27		#'
28		#' \dontrun{
29		#'
30		#' # read in all files from the package directory
31		#' texts <- read.segments(path.package("lingmatch"), ext = "")
32		#' texts[, -4]
33		#'
34		#' # segment .txt files in dir in a few ways:
35		#' dir <- "path/to/files"
36		#'
37		#' ## into 1 line segments
38		#' texts_lines <- read.segments(dir)
39		#'
40		#' ## into 5 even segments each
41		#' texts_5segs <- read.segments(dir, 5)
42		#'
43		#' ## into 50 word segments
44		#' texts_50words <- read.segments(dir, segment.size = 50)
45		#'
46		#' ## into 1 sentence segments
47		#' texts_1sent <- read.segments(dir, segment.size = 1, bysentence = TRUE)
48		#' }
49		#' @export
50
51		read.segments <- function(path = ".", segment = NULL, ext = ".txt", subdir = FALSE, segment.size = -1,
52		bysentence = FALSE, end_in_quotes = TRUE, preclean = FALSE, text = NULL) {
53	!	if (any(path == "")) path[path == ""] <- "."
54	14x	if (!any(file.exists(sub("[\\/]+$", "", path)))) {
55	6x	ck_text <- TRUE
56	6x	files <- path
57		} else {
58	8x	ck_text <- !is.null(text)
59	8x	files <- if (ck_text) {
60	!	text
61		} else {
62	8x	dirs <- list.dirs(path, recursive = subdir)
63	8x	files <- if (any(dir.exists(path))) {
64	1x	unique(list.files(path, ext, recursive = subdir, full.names = TRUE))
65		} else {
66	7x	path[file.exists(path)]
67		}
68	8x	files[!files %in% dirs]
69		}
70		}
71	14x	if (missing(segment) && missing(segment.size)) {
72	4x	segment <- if (length(path) == 1 && (ck_text \|\| !dir.exists(path))) "\n" else 1
73		}
74	14x	if (length(files)) {
75	14x	err <- function(e) NULL
76	14x	args <- list(what = character(), quote = "", na.strings = "", quiet = TRUE)
77	!	if (is.character(segment) && segment.size == -1) args$sep <- segment
78	14x	do.call(rbind, lapply(seq_along(files), function(fi) {
79	44x	f <- files[fi]
80	44x	args[[if (ck_text) "text" else "file"]] <- f
81	44x	WC <- NULL
82	44x	if (ck_text \|\| file.exists(f)) {
83	44x	if (is.numeric(segment) \|\| segment.size > 0) {
84	44x	words <- tryCatch(do.call(scan, args), error = err)
85	44x	if (!length(words)) {
86	!	return(NULL)
87		}
88	!	if (preclean) words <- lma_dict("special", as.function = gsub)(words)
89	44x	TWC <- length(words)
90	32x	if (segment.size == -1) segment.size <- ceiling(TWC / segment)
91	44x	if (bysentence) {
92	8x	if (!is.null(segment) && is.numeric(segment)) {
93	4x	lines <- character(segment)
94	4x	WC <- numeric(segment)
95		} else {
96	4x	lines <- NULL
97	4x	WC <- NULL
98		}
99	8x	sentends <- grep("[.?!]$", if (end_in_quotes) {
100	7x	gsub(if (preclean) {
101		'["\']+'
102		} else {
103	7x	paste0("(?:", paste(
104	7x	c(
105		'["\']',
106	7x	unlist(lma_dict("special")$special$CHARACTERS[c('"', "'")])
107		),
108	7x	collapse = "\|"
109		), ")+$")
110	7x	}, "", words)
111		} else {
112	1x	words
113		})
114	8x	if (length(sentends)) {
115	6x	sentends <- sentends[!grepl(paste0(
116	6x	"\\.[a-z]\|^(?:[a-z]\|[ivxm]+\|\\d+\|ans\|govt\|apt\|etc\|",
117	6x	"st\|rd\|ft\|feat\|dr\|drs\|mr\|ms\|mrs\|messrs\|jr\|prof)[.?!]$"
118	6x	), words[sentends], TRUE, TRUE)]
119		}
120	8x	sentends <- c(1, sentends)
121	8x	nsents <- length(sentends)
122	8x	if (sentends[nsents] != TWC) {
123	3x	sentends <- c(sentends, TWC)
124	3x	nsents <- nsents + 1
125		}
126	8x	i <- s <- p <- 1
127	8x	while (p < nsents && sum(WC) < TWC) {
128	17x	WC[i] <- 0
129	17x	while (p < nsents && WC[i] < segment.size) {
130	56x	p <- p + 1
131	56x	WC[i] <- (sentends[p] - s) + 1
132		}
133	17x	lines[i] <- paste(words[seq(s, sentends[p])], collapse = " ")
134	17x	s <- sentends[p] + 1
135	17x	i <- i + 1
136		}
137		} else {
138	36x	segment <- ceiling(TWC / segment.size)
139	36x	lines <- character(segment)
140	36x	WC <- rep(segment.size, segment)
141	36x	WCC <- 0
142	36x	for (i in seq_len(segment)) {
143	14x	if (WCC + WC[i] > TWC) WC[i] <- TWC - WCC
144	92x	lines[i] <- paste(words[seq(WCC + 1, WCC + WC[i])], collapse = " ")
145	92x	WCC <- WCC + WC[i]
146		}
147		}
148		} else {
149	!	lines <- tryCatch(do.call(scan, args), error = err)
150	!	if (!length(lines)) {
151	!	return(NULL)
152		}
153		}
154		} else {
155	!	lines <- ""
156		}
157	44x	data.frame(
158	44x	input = if (ck_text) fi else f, segment = seq_along(lines),
159	44x	WC = if (is.null(WC)) vapply(strsplit(lines, "\\s+"), function(sp) sum(sp != ""), 0) else WC,
160	44x	text = lines, stringsAsFactors = FALSE
161		)
162		}))
163		} else {
164	!	warning(
165	!	"no files found", if (!subdir) "; might try setting subdir to TRUE to include files in folders"
166		)
167		}
168		}

1		#' Similarity Calculations
2		#'
3		#' Enter a numerical matrix, set of vectors, or set of matrices to calculate similarity per vector.
4		#'
5		#' @param a A vector or matrix. If a vector, \code{b} must also be provided. If a matrix and \code{b}
6		#' is missing, each row will be compared. If a matrix and \code{b} is not missing, each row will
7		#' be compared with \code{b} or each row of \code{b}.
8		#' @param b A vector or matrix to be compared with \code{a} or rows of \code{a}.
9		#' @param metric A character or vector of characters at least partially matching one of the
10		#' available metric names (or 'all' to explicitly include all metrics),
11		#' or a number or vector of numbers indicating the metric by index:
12		#' \itemize{
13		#' \item \strong{\code{jaccard}}: \code{sum(a & b) / sum(a \| b)}
14		#' \item \strong{\code{euclidean}}: \code{1 / (1 + sqrt(sum((a - b) ^ 2)))}
15		#' \item \strong{\code{canberra}}: \code{mean(1 - abs(a - b) / (a + b))}
16		#' \item \strong{\code{cosine}}: \code{sum(a * b) / sqrt(sum(a ^ 2 * sum(b ^ 2)))}
17		#' \item \strong{\code{pearson}}: \code{(mean(a * b) - (mean(a) * mean(b))) /} \cr
18		#' \code{sqrt(mean(a ^ 2) - mean(a) ^ 2) / sqrt(mean(b ^ 2) - mean(b) ^ 2)}
19		#' }
20		#' @param group If \code{b} is missing and \code{a} has multiple rows, this will be used to make
21		#' comparisons between rows of \code{a}, as modified by \code{agg} and \code{agg.mean}.
22		#' @param lag Amount to adjust the \code{b} index; either rows if \code{b} has multiple rows (e.g.,
23		#' for \code{lag = 1}, \code{a[1, ]} is compared with \code{b[2, ]}), or values otherwise (e.g.,
24		#' for \code{lag = 1}, \code{a[1]} is compared with \code{b[2]}). If \code{b} is not supplied,
25		#' \code{b} is a copy of \code{a}, resulting in lagged self-comparisons or autocorrelations.
26		#' @param agg Logical: if \code{FALSE}, only the boundary rows between groups will be compared, see
27		#' example.
28		#' @param agg.mean Logical: if \code{FALSE} aggregated rows are summed instead of averaged.
29		#' @param pairwise Logical: if \code{FALSE} and \code{a} and \code{b} are matrices with the same number of
30		#' rows, only paired rows are compared. Otherwise (and if only \code{a} is supplied), all pairwise
31		#' comparisons are made.
32		#' @param symmetrical Logical: if \code{TRUE} and pairwise comparisons between \code{a} rows were made,
33		#' the results in the lower triangle are copied to the upper triangle.
34		#' @param mean Logical: if \code{TRUE}, a single mean for each metric is returned per row of \code{a}.
35		#' @param return.list Logical: if \code{TRUE}, a list-like object will always be returned, with an entry
36		#' for each metric, even when only one metric is requested.
37		#' @details
38		#' Use \code{\link[RcppParallel]{setThreadOptions}} to change parallelization options; e.g., run
39		#' RcppParallel::setThreadOptions(4) before a call to lma_simets to set the number of CPU
40		#' threads to 4.
41		#' @return Output varies based on the dimensions of \code{a} and \code{b}:
42		#' \itemize{
43		#' \item \strong{Out:} A vector with a value per metric. \cr
44		#' \strong{In:} Only when \code{a} and \code{b} are both vectors.
45		#' \item \strong{Out:} A vector with a value per row. \cr
46		#' \strong{In:} Any time a single value is expected per row: \code{a} or \code{b} is a vector,
47		#' \code{a} and \code{b} are matrices with the same number of rows and \code{pairwise = FALSE}, a group is
48		#' specified, or \code{mean = TRUE}, and only one metric is requested.
49		#' \item \strong{Out:} A data.frame with a column per metric. \cr
50		#' \strong{In:} When multiple metrics are requested in the previous case.
51		#' \item \strong{Out:} A sparse matrix with a \code{metric} attribute with the metric name. \cr
52		#' \strong{In:} Pairwise comparisons within an \code{a} matrix or between
53		#' an \code{a} and \code{b} matrix, when only 1 metric is requested.
54		#' \item \strong{Out:} A list with a sparse matrix per metric. \cr
55		#' \strong{In:} When multiple metrics are requested in the previous case.
56		#' }
57		#' @examples
58		#' text <- c(
59		#' "words of speaker A", "more words from speaker A",
60		#' "words from speaker B", "more words from speaker B"
61		#' )
62		#' (dtm <- lma_dtm(text))
63		#'
64		#' # compare each entry
65		#' lma_simets(dtm)
66		#'
67		#' # compare each entry with the mean of all entries
68		#' lma_simets(dtm, colMeans(dtm))
69		#'
70		#' # compare by group (corresponding to speakers and turns in this case)
71		#' speaker <- c("A", "A", "B", "B")
72		#'
73		#' ## by default, consecutive rows from the same group are averaged:
74		#' lma_simets(dtm, group = speaker)
75		#'
76		#' ## with agg = FALSE, only the rows at the boundary between
77		#' ## groups (rows 2 and 3 in this case) are used:
78		#' lma_simets(dtm, group = speaker, agg = FALSE)
79		#' @export
80
81		lma_simets <- function(a, b = NULL, metric = NULL, group = NULL, lag = 0, agg = TRUE, agg.mean = TRUE,
82		pairwise = TRUE, symmetrical = FALSE, mean = FALSE, return.list = FALSE) {
83	515x	mets <- c("jaccard", "euclidean", "canberra", "cosine", "pearson")
84	515x	if (missing(metric) && length(b) == 1 && !grepl(" ", b) &&
85	515x	any(grepl(tolower(substr(b, 1, 3)), mets, fixed = TRUE))) {
86	8x	metric <- b
87	8x	b <- NULL
88		}
89	515x	met <- match_metric(metric)
90	515x	if (!length(met$selected)) {
91	!	stop(
92	!	"no recognized metric; should match one of ",
93	!	paste0(mets, collapse = ", "), ", or all"
94		)
95		}
96	515x	st <- proc.time()[[3]]
97	515x	slots <- c("i", "p", "x", "Dim")
98	515x	if ((is.character(a) \|\| is.factor(a)) && any(grepl("[a-zA-Z]", a))) {
99	2x	a <- lma_dtm(a)
100	75x	} else if (is.data.frame(a)) a <- Matrix(as.matrix(a), sparse = TRUE)
101	1x	if (is.null(b) && !missing(lag) && is.null(dim(a))) b <- a
102	515x	if (is.null(b)) {
103	114x	n <- dim(a)[1]
104	!	if (is.null(n) \|\| n < 2) stop("a must have more than 1 row when b is not provided", call. = FALSE)
105	114x	if (is.null(group)) {
106	63x	if (!all(slots %in% slotNames(a))) a <- as(a, "CsparseMatrix")
107	77x	res <- calculate_similarities(a, NULL, 2, met$dummy)
108	77x	for (i in seq_along(res)) attr(res[[i]], "metric") <- met$selected[i]
109		} else {
110	!	if (length(group) != n) stop("group is not the same length as a or columns in a")
111	37x	ager <- if (agg.mean) colMeans else colSums
112	37x	l <- length(group)
113	37x	chunks <- NULL
114	37x	i <- 1
115	37x	while (i < l) {
116	193x	st <- i
117	193x	g <- group[i]
118	193x	while (i < l && g == group[i + 1]) i <- i + 1
119	193x	chunks <- c(chunks, list(seq(st, i)))
120	193x	i <- i + 1
121		}
122	25x	if (!any(chunks[[length(chunks)]] == l)) chunks <- c(chunks, list(l))
123	37x	rows <- character(length(chunks) - 1)
124	37x	res <- as.data.frame(matrix(0, length(chunks) - 1, sum(met$dummy), dimnames = list(NULL, met$selected)))
125	37x	for (i in seq_len(length(chunks) - 1)) {
126	181x	s <- chunks[[i]]
127	181x	sa <- if (agg) s else s[length(s)]
128	181x	ta <- ager(a[sa, , drop = FALSE])
129	181x	s <- chunks[[i + 1]]
130	181x	sb <- if (agg) s else s[1]
131	181x	tb <- ager(a[sb, , drop = FALSE])
132	181x	res[i, ] <- vector_similarity(ta, tb, met$dummy)
133	181x	rows[i] <- paste(paste(sa, collapse = ", "), "<->", paste(sb, collapse = ", "))
134		}
135	37x	rownames(res) <- rows
136		}
137		} else {
138	401x	if ((is.character(b) \|\| is.factor(b)) && any(grepl("[a-zA-Z]", b))) {
139	1x	b <- lma_dtm(b)
140	7x	} else if (is.data.frame(b)) b <- Matrix(as.matrix(b), sparse = TRUE)
141	401x	bn <- if (is.null(dim(b))) length(b) else dim(b)[1]
142	4x	if (lag && abs(lag) >= bn) lag <- if (lag < 0) -bn + 1 else bn - 1
143	401x	res <- if (is.null(dim(b)) && length(a) == bn && (is.null(dim(a)) \|\| any(dim(a) == 1))) {
144	247x	b <- as.numeric(b)
145	5x	if (lag) b <- if (lag < 0) c(b[-seq_len(-lag)], numeric(-lag)) else c(numeric(lag), b)[seq_len(bn)]
146	247x	vector_similarity(as.numeric(a), b, met$dummy)
147		} else {
148	2x	if (is.null(dim(a))) a <- Matrix(a, 1, dimnames = list(NULL, names(a)), sparse = TRUE)
149	85x	if (!all(slots %in% slotNames(a))) a <- as(a, "CsparseMatrix")
150	76x	if (is.null(dim(b))) b <- Matrix(b, 1, dimnames = list(NULL, names(b)), sparse = TRUE)
151	28x	if (!all(slots %in% slotNames(b))) b <- as(b, "CsparseMatrix")
152	154x	d <- c(dim(a), dim(b))
153	154x	if (d[2] != d[4]) {
154	3x	ns <- colnames(a)
155	3x	if (!is.null(ns)) {
156	3x	ns <- ns[ns %in% colnames(b)]
157	3x	if (length(ns)) {
158	3x	a <- a[, ns, drop = FALSE]
159	3x	b <- b[, ns, drop = FALSE]
160		}
161		}
162	3x	d <- c(dim(a), dim(b))
163	3x	if (d[2] != d[4]) {
164	!	stop("a and b have a different number of columns, which could not be aligned by name")
165		}
166		}
167	154x	if (lag) {
168	4x	b <- if (lag > 0) {
169	2x	rbind(Matrix(0, lag, d[4], sparse = TRUE), b[-(seq_len(lag) + d[3] - lag), ])
170		} else {
171	2x	rbind(b[-seq_len(-lag), ], Matrix(0, -lag, d[4], sparse = TRUE))
172		}
173		}
174	154x	type <- if (((missing(pairwise) \|\| !pairwise) && d[1] == d[3]) \|\|
175	154x	d[3] == 1) {
176	105x	1
177		} else {
178	49x	3
179		}
180	154x	calculate_similarities(a, b, type, met$dummy)
181		}
182		}
183	515x	if ("list" %in% class(res) && length(res)) {
184	231x	pairwise <- "dtCMatrix" %in% class(res[[1]])
185	231x	if ((pairwise && symmetrical) \|\| mean) {
186	33x	for (i in seq_along(res)) {
187	32x	if (pairwise && (symmetrical \|\| mean)) res[[i]] <- forceSymmetric(res[[i]], "L")
188	33x	if (mean) {
189	3x	res[[i]] <- if (is.null(dim(res[[i]]))) {
190	!	mean(res[[i]], na.rm = TRUE)
191		} else {
192	3x	(rowSums(res[[i]], TRUE) - 1) / (ncol(res[[i]]) - 1)
193		}
194		}
195		}
196		}
197	231x	if (is.null(dim(res[[1]]))) {
198	111x	rn <- if (!is.na(nd <- which(c(dim(a), dim(b)) == length(res[[1]]))[1]) && !is.null(rownames(if (nd == 1) a else b))) {
199	37x	rownames(if (nd == 1) a else b)
200		} else {
201	74x	NULL
202		}
203	111x	if (length(met$selected) == 1) {
204	37x	if (length(rn) == length(res[[1]])) names(res[[1]]) <- rn
205		} else {
206	17x	attr(res, "row.names") <- if (length(rn) == length(res[[1]])) rn else seq_along(res[[1]])
207	17x	attr(res, "class") <- "data.frame"
208		}
209		}
210	210x	if (!return.list && length(met$selected) == 1) res <- res[[1]]
211		}
212	515x	attr(res, "time") <- c(simets = proc.time()[[3]] - st)
213	515x	res
214		}

1		#' Select Latent Semantic Spaces
2		#'
3		#' Retrieve information and links to latent semantic spaces
4		#' (sets of word vectors/embeddings) available at \href{https://osf.io/489he}{osf.io/489he},
5		#' and optionally download their term mappings (\href{https://osf.io/xr7jv}{osf.io/xr7jv}).
6		#'
7		#' @param query A character used to select spaces, based on names or other features.
8		#' If length is over 1, \code{get.map} is set to \code{TRUE}. Use \code{terms} alone to select
9		#' spaces based on term coverage.
10		#' @param dir Path to a directory containing \code{lma_term_map.rda} and downloaded spaces; \cr will look in
11		#' \code{getOption('lingmatch.lspace.dir')} and \code{'~/Latent Semantic Spaces'} by default.
12		#' @param terms A character vector of terms to search for in the downloaded term map, to calculate
13		#' coverage of spaces, or select by coverage if \code{query} is not specified.
14		#' @param get.map Logical; if \code{TRUE} and \code{lma_term_map.rda} is not found in
15		#' \code{dir}, the term map (\href{https://osf.io/xr7jv}{lma_term_map.rda}) is
16		#' downloaded and decompressed.
17		#' @param check.md5 Logical; if \code{TRUE} (default), retrieves the MD5 checksum from OSF,
18		#' and compares it with that calculated from the downloaded file to check its integrity.
19		#' @param mode Passed to \code{\link{download.file}} when downloading the term map.
20		#' @return A list with varying entries:
21		#' \itemize{
22		#' \item \strong{\code{info}}: The version of \href{https://osf.io/9yzca}{osf.io/9yzca} stored internally; a
23		#' \code{data.frame} with spaces as row names, and information about each space in columns:
24		#' \itemize{
25		#' \item \strong{\code{terms}}: number of terms in the space
26		#' \item \strong{\code{corpus}}: corpus(es) on which the space was trained
27		#' \item \strong{\code{model}}: model from which the space was trained
28		#' \item \strong{\code{dimensions}}: number of dimensions in the model (columns of the space)
29		#' \item \strong{\code{model_info}}: some parameter details about the model
30		#' \item \strong{\code{original_max}}: maximum value used to normalize the space; the original
31		#' space would be \code{(vectors *} \code{original_max) /} \code{100}
32		#' \item \strong{\code{osf_dat}}: OSF id for the \code{.dat} files; the URL would be
33		#' https://osf.io/\code{osf_dat}
34		#' \item \strong{\code{osf_terms}}: OSF id for the \code{_terms.txt} files; the URL would be
35		#' https://osf.io/\code{osf_terms}
36		#' \item \strong{\code{wiki}}: link to the wiki for the space
37		#' \item \strong{\code{downloaded}}: path to the \code{.dat} file if downloaded,
38		#' and \code{''} otherwise.
39		#' }
40		#' \item \strong{\code{selected}}: A subset of \code{info} selected by \code{query}.
41		#' \item \strong{\code{term_map}}: If \code{get.map} is \code{TRUE} or \code{lma_term_map.rda} is found in
42		#' \code{dir}, a copy of \href{https://osf.io/xr7jv}{osf.io/xr7jv}, which has space names as
43		#' column names, terms as row names, and indices as values, with 0 indicating the term is not
44		#' present in the associated space.
45		#' }
46		#' @family Latent Semantic Space functions
47		#' @examples
48		#' # just retrieve information about available spaces
49		#' spaces <- select.lspace()
50		#' spaces$info[1:10, c("terms", "dimensions", "original_max")]
51		#'
52		#' # retrieve all spaces that used word2vec
53		#' w2v_spaces <- select.lspace("word2vec")$selected
54		#' w2v_spaces[, c("terms", "dimensions", "original_max")]
55		#'
56		#' \dontrun{
57		#'
58		#' # select spaces by terms
59		#' select.lspace(terms = c(
60		#' "part-time", "i/o", "'cause", "brexit", "debuffs"
61		#' ))$selected[, c("terms", "coverage")]
62		#' }
63		#' @export
64
65		select.lspace <- function(query = NULL, dir = getOption("lingmatch.lspace.dir"), terms = NULL,
66		get.map = FALSE, check.md5 = TRUE, mode = "wb") {
67	16x	if (ckd <- dir == "") dir <- "~/Latent Semantic Spaces"
68	17x	if (!missing(query) && !is.character(query) && !is.null(colnames(query))) {
69	!	terms <- colnames(query)
70	!	query <- NULL
71		}
72	17x	map_path <- normalizePath(paste0(dir, "/lma_term_map.rda"), "/", FALSE)
73	5x	if (missing(get.map) && (file.exists(map_path) \|\| length(terms) > 1)) get.map <- TRUE
74	17x	if (!exists("lma_term_map")) lma_term_map <- NULL
75	!	if (get.map && ckd && !dir.exists(dir)) stop("specify `dir` or use `lma_initdirs()` to download the term map")
76	17x	if (get.map && !(file.exists(map_path) \|\| !is.null(lma_term_map))) {
77	1x	fi <- tryCatch(
78	1x	strsplit(readLines("https://api.osf.io/v2/files/xr7jv", 1, TRUE, FALSE, "utf-8"), '[:,{}"]+')[[1]],
79	1x	error = function(e) NULL
80		)
81	1x	if (!file.exists(map_path) \|\| (!is.null(fi) && md5sum(map_path) != fi[which(fi == "md5") + 1])) {
82	1x	dir.create(dir, FALSE, TRUE)
83	1x	status <- tryCatch(download.file(
84	1x	"https://osf.io/download/xr7jv", map_path,
85	1x	mode = mode
86	1x	), error = function(e) 1)
87	1x	if (!status && check.md5 && !is.null(fi)) {
88	1x	ck <- md5sum(map_path)
89	1x	if (fi[which(fi == "md5") + 1] == ck) {
90	1x	load(map_path)
91	1x	save(lma_term_map, file = map_path, compress = FALSE)
92		} else {
93	!	warning(paste0(
94	!	"The term map's MD5 (", ck, ") does not seem to match the one on record;\n",
95	!	"double check and try manually downloading at https://osf.io/xr7jv/?show=revision"
96		))
97		}
98		}
99		}
100	16x	} else if (!file.exists(map_path) && !is.null(terms)) {
101	!	stop("The term map could not be found; specify dir or run lma_initdirs('~') to download it", call. = FALSE)
102		}
103	17x	r <- list(info = lss_info, selected = lss_info[NULL, ])
104	17x	r$info[, "wiki"] <- paste0("https://osf.io/489he/wiki/", rownames(lss_info))
105	17x	r$info[, "downloaded"] <- normalizePath(paste0(dir, "/", rownames(r$info), ".dat"), "/", FALSE)
106	17x	r$info[!file.exists(r$info[, "downloaded"]), "downloaded"] <- ""
107	17x	if (get.map) {
108	10x	if (!is.null(lma_term_map)) {
109	1x	r$term_map <- lma_term_map
110	9x	} else if (file.exists(map_path) && is.null(lma_term_map)) {
111	9x	load(map_path)
112	9x	r$term_map <- lma_term_map
113	9x	rm(list = "lma_term_map")
114		}
115		}
116	17x	if (!is.null(terms)) {
117	3x	if (length(terms) > 1 && "term_map" %in% names(r)) {
118	3x	terms <- tolower(terms)
119	3x	overlap <- terms[terms %in% rownames(r$term_map)]
120	3x	if (length(overlap)) {
121	3x	r$info$coverage <- colSums(r$term_map[overlap, , drop = FALSE] != 0) / length(terms)
122	3x	r$selected <- r$info[order(r$info$coverage, decreasing = TRUE)[1:5], ]
123	3x	r$space_terms <- overlap
124		} else {
125	!	warning("no terms were found")
126		}
127		}
128		}
129	17x	if (!is.null(query)) {
130	8x	query <- paste0(query, collapse = "\|")
131	8x	if (!length(sel <- grep(query, rownames(lss_info), TRUE))) {
132	3x	collapsed <- vapply(
133	3x	seq_len(nrow(lss_info)),
134	3x	function(r) paste(c(rownames(lss_info)[r], lss_info[r, ]), collapse = " "), ""
135		)
136	3x	if (!length(sel <- grep(query, collapsed, TRUE))) {
137	2x	sel <- grep(paste(strsplit(query, "[[:space:],\|]+")[[1]], collapse = "\|"), collapsed, TRUE)
138		}
139		}
140	8x	if (length(sel)) r$selected <- r$info[sel, ]
141		}
142	17x	r
143		}

1		#' Select Dictionaries
2		#'
3		#' Retrieve information and links to dictionaries
4		#' (lexicons/word lists) available at \href{https://osf.io/y6g5b}{osf.io/y6g5b}.
5		#'
6		#' @param query A character matching a dictionary name, or a set of keywords to search for in
7		#' dictionary information.
8		#' @param dir Path to a folder containing dictionaries, or where you want them to be saved.
9		#' Will look in getOption('lingmatch.dict.dir') and '~/Dictionaries' by default.
10		#' @param check.md5 Logical; if \code{TRUE} (default), retrieves the MD5 checksum from OSF,
11		#' and compares it with that calculated from the downloaded file to check its integrity.
12		#' @param mode Passed to \code{\link{download.file}} when downloading files.
13		#' @return A list with varying entries:
14		#' \itemize{
15		#' \item \strong{\code{info}}: The version of \href{https://osf.io/kjqb8}{osf.io/kjqb8} stored internally; a
16		#' \code{data.frame} with dictionary names as row names, and information about each dictionary in columns. \cr
17		#' Also described at
18		#' \href{https://osf.io/y6g5b/wiki/dict_variables}{osf.io/y6g5b/wiki/dict_variables},
19		#' here \code{short} (corresponding to the file name [\code{{short}.(csv\|dic)}] and
20		#' wiki urls [\code{https://osf.io/y6g5b/wiki/{short}}]) is set as row names and removed:
21		#' \itemize{
22		#' \item \strong{\code{name}}: Full name of the dictionary.
23		#' \item \strong{\code{description}}: Description of the dictionary, relating to its purpose and
24		#' development.
25		#' \item \strong{\code{note}}: Notes about processing decisions that additionally alter the original.
26		#' \item \strong{\code{constructor}}: How the dictionary was constructed:
27		#' \itemize{
28		#' \item \strong{\code{algorithm}}: Terms were selected by some automated process, potentially
29		#' learned from data or other resources.
30		#' \item \strong{\code{crowd}}: Several individuals rated the terms, and in aggregate those ratings
31		#' translate to categories and weights.
32		#' \item \strong{\code{mixed}}: Some combination of the other methods, usually in some iterative
33		#' process.
34		#' \item \strong{\code{team}}: One of more individuals make decisions about term inclusions,
35		#' categories, and weights.
36		#' }
37		#' \item \strong{\code{subject}}: Broad, rough subject or purpose of the dictionary:
38		#' \itemize{
39		#' \item \strong{\code{emotion}}: Terms relate to emotions, potentially exemplifying or expressing
40		#' them.
41		#' \item \strong{\code{general}}: A large range of categories, aiming to capture the content of the
42		#' text.
43		#' \item \strong{\code{impression}}: Terms are categorized and weighted based on the impression they
44		#' might give.
45		#' \item \strong{\code{language}}: Terms are categorized or weighted based on their linguistic
46		#' features, such as part of speech, specificity, or area of use.
47		#' \item \strong{\code{social}}: Terms relate to social phenomena, such as characteristics or concerns
48		#' of social entities.
49		#' }
50		#' \item \strong{\code{terms}}: Number of unique terms across categories.
51		#' \item \strong{\code{term_type}}: Format of the terms:
52		#' \itemize{
53		#' \item \strong{\code{glob}}: Include asterisks which denote inclusion of any characters until a
54		#' word boundary.
55		#' \item \strong{\code{glob+}}: Glob-style asterisks with regular expressions within terms.
56		#' \item \strong{\code{ngram}}: Includes any number of words as a term, separated by spaces.
57		#' \item \strong{\code{pattern}}: A string of characters, potentially within or between words, or
58		#' spanning words.
59		#' \item \strong{\code{regex}}: Regular expressions.
60		#' \item \strong{\code{stem}}: Unigrams with common endings removed.
61		#' \item \strong{\code{unigram}}: Complete single words.
62		#' }
63		#' \item \strong{\code{weighted}}: Indicates whether weights are associated with terms. This
64		#' determines the file type of the dictionary: dictionaries with weights are stored
65		#' as .csv, and those without are stored as .dic files.
66		#' \item \strong{\code{regex_characters}}: Logical indicating whether special regular expression
67		#' characters are present in any term, which might need to be escaped if the terms are used
68		#' in regular expressions. Glob-type terms allow complete parens (at least one open and one
69		#' closed, indicating preceding or following words), and initial and terminal asterisks. For
70		#' all other terms, \code{[](){}*.^$+?\\|} are counted as regex characters. These could be
71		#' escaped in R with \code{gsub('([][)(}{*.^$+?\\\\\|])', '\\\\\\1', terms)} if \code{terms}
72		#' is a character vector, and in Python with (importing re)
73		#' \code{[re.sub(r'([][(){}*.^$+?\\|])', r'\\\1', term)} \code{for term in terms]} if \code{terms}
74		#' is a list.
75		#' \item \strong{\code{categories}}: Category names in the order in which they appear in the dictionary
76		#' file, separated by commas.
77		#' \item \strong{\code{ncategories}}: Number of categories.
78		#' \item \strong{\code{original_max}}: Maximum value of the original dictionary before standardization:
79		#' \code{original values / max(original values) * 100}. Dictionaries with no weights are
80		#' considered to have a max of \code{1}.
81		#' \item \strong{\code{osf}}: ID of the file on OSF, translating to the file's URL:
82		#' https://osf.io/\code{osf}.
83		#' \item \strong{\code{wiki}}: URL of the dictionary's wiki.
84		#' \item \strong{\code{downloaded}}: Path to the file if downloaded, and \code{''} otherwise.
85		#' }
86		#' \item \strong{\code{selected}}: A subset of \code{info} selected by \code{query}.
87		#' }
88		#' @family Dictionary functions
89		#' @examples
90		#' # just retrieve information about available dictionaries
91		#' dicts <- select.dict()$info
92		#' dicts[1:10, 4:9]
93		#'
94		#' # select all dictionaries mentioning sentiment or emotion
95		#' sentiment_dicts <- select.dict("sentiment emotion")$selected
96		#' sentiment_dicts[1:10, 4:9]
97		#' @export
98
99		select.dict <- function(query = NULL, dir = getOption("lingmatch.dict.dir"),
100		check.md5 = TRUE, mode = "wb") {
101	6x	if (dir == "") dir <- "~/Dictionaries"
102	7x	r <- list(info = dict_info, selected = dict_info[NULL, ])
103	7x	r$info[, "wiki"] <- paste0("https://osf.io/y6g5b/wiki/", rownames(dict_info))
104	7x	r$info[, "downloaded"] <- normalizePath(paste0(
105	7x	dir, "/", rownames(r$info), ifelse(r$info$weighted, ".csv", ".dic")
106	7x	), "/", FALSE)
107	7x	r$info[!file.exists(r$info[, "downloaded"]), "downloaded"] <- ""
108	7x	if (!missing(query)) {
109	6x	query <- paste0(query, collapse = "\|")
110	6x	if (!length(sel <- grep(query, rownames(dict_info), TRUE))) {
111	2x	collapsed <- vapply(
112	2x	seq_len(nrow(dict_info)),
113	2x	function(r) paste(c(rownames(dict_info)[r], dict_info[r, ]), collapse = " "), ""
114		)
115	2x	if (!length(sel <- grep(query, collapsed, TRUE))) {
116	1x	sel <- grep(paste(strsplit(query, "[[:space:],\|]+")[[1]], collapse = "\|"), collapsed, TRUE)
117		}
118		}
119	5x	if (length(sel)) r$selected <- r$info[sel, , drop = FALSE]
120		}
121	7x	r
122		}

1		#' Download Latent Semantic Spaces
2		#'
3		#' Downloads the specified semantic space from \href{https://osf.io/489he}{osf.io/489he}.
4		#'
5		#' @param space Name of one or more spaces you want to download, or \code{'all'} for all available.
6		#' \code{'100k_lsa'} is the default, and some other common options might be \code{'google'}, \code{'facebook'},
7		#' or \code{'glove'}. See \href{https://osf.io/489he/wiki/home}{osf.io/489he/wiki} for more information,
8		#' and a full list of spaces.
9		#' @param decompress Logical; if \code{TRUE} (default), decompresses the downloaded file
10		#' with the \code{bunzip2} system command assuming it is available \cr (as indicated by
11		#' \code{Sys.which('bunzip2')}).
12		#' @param check.md5 Logical; if \code{TRUE} (default), retrieves the MD5 checksum from OSF,
13		#' and compares it with that calculated from the downloaded file to check its integrity.
14		#' @param mode A character specifying the file write mode; default is 'wb'. See
15		#' \code{\link{download.file}}.
16		#' @param dir Directory in which to save the space. Specify this here, or set the lspace directory option
17		#' (e.g., \code{options(lingmatch.lspace.dir = '~/Latent Semantic Spaces')}), or use
18		#' \code{\link{lma_initdirs}} to initialize a directory.
19		#' @param overwrite Logical; if \code{TRUE}, will replace existing files.
20		#' @family Latent Semantic Space functions
21		#' @return A character vector with paths to the [1] data and [2] term files.
22		#' @examples
23		#' \dontrun{
24		#'
25		#' download.lspace("glove_crawl", dir = "~/Latent Semantic Spaces")
26		#' }
27		#' @export
28		#' @importFrom utils download.file
29		#' @importFrom tools md5sum
30
31		download.lspace <- function(space = "100k_lsa", decompress = TRUE,
32		check.md5 = TRUE, mode = "wb", dir = getOption("lingmatch.lspace.dir"),
33		overwrite = FALSE) {
34	1x	download.resource(
35	1x	"lspace", space,
36	1x	decompress = decompress, check.md5 = check.md5, mode = mode, dir = dir, overwrite = overwrite
37		)
38		}

1		#include <Rcpp.h>
2		using namespace std;
3		using namespace Rcpp;
4
5		/**
6		* @brief Converts tokenized texts to indices, as part of document-term matrix
7		* creation.
8		*
9		* @param tokens A list of character vectors; tokenized texts.
10		* @param terms A character vector containing all terms.
11		* @param isword A logical vector indicating whether each term is a word or.
12		* punctuation mark (and so should count toward word-count).
13		* @param dim Dimensions of the output matrix [n rows, n cols].
14		* @param complete Logical indicating whether to all terms tokens should be
15		* counted, or only those in `terms` should count.
16		* @param tokensonly Logical specifying the return type.
17		* @return List; a term map with row indices (if `tokensonly`), or a sparse
18		* matrix.
19		*/
20
21		// [[Rcpp::export]]
22	77x	List match_terms(const List &tokens, const CharacterVector &terms,
23		const LogicalVector &isword, const IntegerVector &dim,
24		const bool &complete, const bool &tokensonly) {
25	77x	const int n = tokens.length();
26	77x	int i = terms.length(), colindex, s = 0, un, p, ck = 1e3;
27	308x	vector<int> rows, columns, rowsums(n), colsums(i);
28	77x	vector<double> counts;
29	77x	CharacterVector uses;
30	77x	unordered_map<String, int> dict;
31	15081x	for (; i--;) dict.insert({terms[i], i});
32	77x	if (tokensonly) {
33	32x	for (; s < n; s++) {
34	30x	uses = tokens[s];
35	1230x	for (un = uses.length(), p = 0; p < un; p++)
36	1200x	if (complete \|\| dict.find(uses[p]) != dict.end()) {
37	1178x	colindex = dict.at(uses[p]);
38	1178x	rows.push_back(colindex);
39	1178x	colsums[colindex]++;
40	1178x	rowsums[s]++;
41		}
42	30x	if (!--ck) {
43	!	checkUserInterrupt();
44	!	ck = 1e3;
45		}
46		}
47	2x	return List::create(dict, colsums, rowsums, rows);
48		} else {
49	1301x	for (; s < n; s++) {
50	1226x	uses = tokens[s];
51	63612x	for (un = uses.length(), p = 0; p < un; p++)
52	62386x	if (complete \|\| dict.find(uses[p]) != dict.end()) {
53	62355x	colindex = dict.at(uses[p]);
54	62355x	colsums[colindex]++;
55	62355x	if (isword[colindex]) rowsums[s]++;
56	62355x	if (p == 0 \|\| uses[p] != uses[p - 1]) {
57	62021x	rows.push_back(s);
58	62021x	columns.push_back(colindex);
59	62021x	counts.push_back(1);
60	62021x	i++;
61		} else
62	334x	counts[i]++;
63		}
64	1226x	if (!--ck) {
65	!	checkUserInterrupt();
66	!	ck = 1e3;
67		}
68		}
69	150x	S4 dtm("dgTMatrix");
70	75x	dtm.slot("Dim") = dim;
71	225x	dtm.slot("Dimnames") = List::create(R_NilValue, terms);
72	150x	dtm.slot("i") = rows;
73	150x	dtm.slot("j") = columns;
74	75x	dtm.slot("x") = counts;
75	75x	return List::create(dtm, rowsums, colsums);
76		}
77		}

1		// [[Rcpp::depends(BH)]]
2		#include <Rcpp.h>
3
4		#include <fstream>
5		#define BOOST_REGEX_USE_C_LOCALE
6		#include <boost/regex.hpp>
7		using namespace std;
8		using namespace Rcpp;
9
10		/**
11		* @brief Standardizes embeddings.
12		*
13		* @param infile Path to original embeddings file.
14		* @param outfile Base path to reformatted files, which will translate to a
15		* `.dat` and `_terms.txt` file.
16		* @param sep Separating character(s).
17		* @param digits Number of digits to round to.
18		* @param remove String to remove to any terms.
19		* @param term_check Regex pattern used to check terms for inclusion.
20		* @param verbose Logical indicating whither to print every thousandth line
21		* during reformatting.
22		* @return void; writes output files.
23		*/
24
25		// [[Rcpp::export]]
26	2x	void reformat_embedding(const std::string &infile, const std::string &outfile,
27		const char &sep = ' ', const int &digits = 9,
28		const std::string &remove = "",
29		const std::string &term_check =
30		"^[a-zA-Z]+$\|^['a-zA-Z][a-zA-Z.'\\/-]*[a-zA-Z.]$",
31		const bool &verbose = false) {
32	2x	ifstream d(infile);
33	2x	ofstream o(outfile + ".dat"), cn(outfile + "_terms.txt");
34	2x	o << setprecision(digits) << fixed;
35	2x	int n, i, ln, cl = 0, ck = 1e3;
36	2x	const string num = "-.0123456789";
37	2x	bool start = true, filter = term_check != "";
38	2x	boost::regex ckterm(term_check, boost::regex_constants::optimize),
39	2x	rm(remove, boost::regex_constants::optimize);
40	2x	std::string line, term, value;
41	202x	for (; getline(d, line);) {
42	1420x	for (cl++, term = "", n = line.length(), i = 0; i < n; i++) {
43	1420x	if (line[i] == sep) break;
44	1220x	term.push_back(line[i]);
45		}
46		if (remove != "") term = boost::regex_replace(term, rm, "");
47	400x	if (term != "" && n > 100 && i++ < n &&
48	590x	(!filter \|\| boost::regex_match(term, ckterm)) &&
49	190x	num.find(line[i]) != string::npos)
50		try {
51	190x	ln = line.length();
52	190x	if (ln) {
53	190x	cn << term << endl;
54	1150524x	for (start = true, value = ""; i < ln; i++) {
55	1150334x	if (line[i] == sep) {
56	56810x	if (value != "") {
57	56810x	if (start) {
58	190x	o << atof(value.c_str());
59	190x	start = false;
60		} else
61	56620x	o << ' ' << atof(value.c_str());
62	56810x	value = "";
63		}
64		} else
65	1093524x	value.push_back(line[i]);
66		}
67		value == "" ? o << endl : o << ' ' << atof(value.c_str()) << endl;
68		}
69		} catch (const std::exception &e) {
70		Rcout << line << endl;
71		}
72	200x	if (!--ck) {
73	!	checkUserInterrupt();
74	!	if (verbose) Rcout << "line " << cl << ": " << term << endl;
75	!	ck = 1e3;
76		}
77		}
78		}
79
80		/**
81		* @brief Read embeddings vectors based on term indices.
82		*
83		* @param indices Integer vector containing the indices of the vectors to load.
84		* @param file File containing vectors.
85		* @param sep Separator between values.
86		* @return NumericVector; term by dimension matrix.
87		*/
88
89		// [[Rcpp::export]]
90	18x	NumericVector extract_indices(const IntegerVector &indices,
91		const std::string &file, const char &sep = ' ') {
92	18x	int nr = indices.length(), ck = 1e3, nc = 0, c = 0, t = 0, p = 0, ln, i;
93	18x	std::string line, value;
94	18x	ifstream d(file);
95	49307x	for (getline(d, line), t = p = line.length(), value = ""; p--;) {
96	49289x	if (line[p] == sep) {
97	5182x	if (p != t - 1) nc++;
98	44107x	} else if (!nc)
99	155x	value.insert(value.begin(), line[p]);
100		}
101		nc == 1 ? nc = atoi(value.c_str()) : nc++;
102	18x	IntegerVector dims = {nc, nr};
103	18x	NumericVector r(nc * nr);
104	18x	if (indices[0] == 1) {
105	8x	d.seekg(0, d.beg);
106	8x	p = 0;
107		} else
108	10x	p = 1;
109	1238496x	for (t = 0; t < nr;) {
110	1238478x	if (++p == indices[t]) {
111	310764x	getline(d, line);
112	363189637x	for (value = "", ln = line.length(), c = 0, i = 0; i < ln; i++) {
113	362878873x	if (line[i] == sep) {
114	37437236x	r[t * nc + c++] = atof(value.c_str());
115	37437236x	value = "";
116		} else
117	325441637x	value.push_back(line[i]);
118		}
119	310764x	r[t++ * nc + c] = atof(value.c_str());
120		} else
121	927714x	d.ignore(numeric_limits<streamsize>::max(), '\n');
122	1238478x	if (!--ck) {
123	1231x	checkUserInterrupt();
124	1231x	ck = 1e3;
125		}
126		}
127	18x	d.close();
128	18x	r.attr("dim") = dims;
129	36x	return r;
130		}

1		// [[Rcpp::depends(RcppParallel)]]
2		#include <Rcpp.h>
3		#include <RcppParallel.h>
4		using namespace std;
5		using namespace Rcpp;
6		using namespace RcppParallel;
7
8		/**
9		* @brief Calculate similarity between two vectors.
10		*
11		* @param a First numeric vector.
12		* @param b Second numeric vector.
13		* @param metrics Binary integer vector indicating which metrics to calculate
14		* ([Jaccard, Euclidean, Canberra, Cosine, Pearson's]).
15		* @return NumericVector; each requested similarity value.
16		*/
17
18		// [[Rcpp::export]]
19	428x	NumericVector vector_similarity(NumericVector &a, NumericVector &b,
20		const IntegerVector &metrics) {
21	428x	const int n = a.length();
22	428x	bool ck = true;
23	428x	NumericVector op;
24	428x	LogicalVector su = is_na(a) \| is_na(b) \| ((a == 0) & (b == 0));
25	428x	if (is_true(all(su)))
26	!	ck = false;
27	428x	else if (is_true(any(su))) {
28	145x	a = a[!su];
29	145x	b = b[!su];
30		}
31	428x	if (metrics[0]) {
32		op.push_back(ck ? (float)sum((a != 0) & (b != 0)) / sum((a != 0) \| (b != 0))
33		: NA_REAL,
34		"jaccard");
35		}
36	428x	if (metrics[1])
37		op.push_back(ck ? 1 / (1 + sqrt(sum(pow(a - b, 2)))) : 1, "euclidean");
38	428x	if (metrics[2])
39		op.push_back(ck ? 1 - sum(abs(a - b) / (abs(a) + abs(b))) / n : NA_REAL,
40		"canberra");
41	428x	if (metrics[3]) {
42	606x	op.push_back(
43		ck ? sum(a * b) / sqrt(sum(pow(a, 2) * sum(pow(b, 2)))) : NA_REAL,
44		"cosine");
45	610x	if (isnan((float)op["cosine"])) op["cosine"] = NA_REAL;
46		}
47	428x	if (metrics[4]) {
48	27x	const double ma = sum(a) / n, mb = sum(b) / n;
49		op.push_back(ck ? (sum(a * b) / n - (ma * mb)) /
50	27x	sqrt(sum(pow(a, 2)) / n - pow(ma, 2)) /
51	54x	sqrt(sum(pow(b, 2)) / n - pow(mb, 2))
52		: NA_REAL,
53		"pearson");
54	58x	if (isnan((float)op["pearson"])) op["pearson"] = NA_REAL;
55		}
56	856x	return op;
57		}
58
59		class Sparse_Arrays {
60		public:
61		const NumericVector values;
62		const IntegerVector dims, i, p;
63		const int n = 0, ncol = 0;
64		IntegerVector row_starts, row_maps, columns;
65	385x	Sparse_Arrays(const S4 &m)
66	1155x	: values(m.slot("x")),
67	770x	dims(m.slot("Dim")),
68	770x	i(m.slot("i")),
69	385x	p(m.slot("p")),
70	385x	n(values.length()),
71	385x	ncol(dims[1]),
72	385x	row_starts(IntegerVector(dims[0], -1)),
73	385x	row_maps(IntegerVector(n, -1)),
74	385x	columns(IntegerVector(n)) {
75	385x	int index = 0, r = 0, c = p[0];
76	385x	unordered_map<int, int> row_steps;
77	132229890x	for (; index < n; index++) {
78	132229505x	r = i[index];
79	132229505x	if (row_starts[r] == -1) {
80	1040651x	row_starts[r] = index;
81	1040651x	row_steps.insert({r, index});
82		} else {
83	131188854x	row_maps[row_steps.at(r)] = index;
84	131188854x	row_steps.at(r) = index;
85		}
86	132229505x	if (c + 1 < ncol && index == p[c + 1]) {
87	49383x	c++;
88	55472x	while (c + 1 < ncol && p[c] == p[c + 1]) c++;
89		}
90	132229505x	columns[index] = c;
91		}
92		}
93		};
94
95		struct Compare : public Worker {
96		const Sparse_Arrays a, b;
97		const int n, ncol, type, nmetrics = 5;
98		RVector<int> aind, bind, metrics;
99		RVector<double> jaccard, euclidean, canberra, cosine, pearson;
100	231x	Compare(const Sparse_Arrays &a, const Sparse_Arrays &b,
101		const IntegerVector &i, const IntegerVector &j, const int &type,
102		const IntegerVector &metrics,
103		unordered_map<String, NumericVector> &out)
104	462x	: a(a),
105	231x	b(b),
106	231x	n(a.n),
107	231x	ncol(a.ncol),
108	231x	type(type),
109	231x	aind(i),
110	231x	bind(j),
111	231x	metrics(metrics),
112	231x	jaccard(out.at("jaccard")),
113	231x	euclidean(out.at("euclidean")),
114	231x	canberra(out.at("canberra")),
115	231x	cosine(out.at("cosine")),
116	693x	pearson(out.at("pearson")) {}
117	137626x	void operator()(size_t p, size_t final) {
118		int r, comp, i, c, bi, bc, l, col;
119		double x, bx, dif, sa, sb, sdif, sadif, sse, sne, cp, asq, bsq, sj, si, ma,
120		mb;
121	2936314x	for (; p < final; p++) {
122	2799259x	r = aind[p];
123	2795707x	comp = bind[p];
124	2796901x	i = a.row_starts[r];
125	2779528x	c = i == -1 ? -1 : a.columns[i];
126	2774633x	bi = b.row_starts[comp];
127	2771176x	bc = bi == -1 ? -1 : b.columns[bi];
128	2767589x	x = bx = sa = sb = sdif = sadif = sse = sne = cp = asq = bsq = sj = si =
129		0;
130	234284358x	for (l = ncol; l--;) {
131	263110273x	col = c;
132	263110273x	if (i == -1 \|\| (bc != -1 && c > bc)) {
133	754610x	x = 0;
134	754610x	if (i == -1) c = -1;
135		} else {
136	262355663x	x = a.values[i];
137	163406580x	i = a.row_maps[i];
138	135819982x	c = i == -1 ? -1 : a.columns[i];
139		if (NumericVector::is_na(x)) continue;
140		}
141	118446667x	if (bi == -1 \|\| (col != -1 && bc > col)) {
142	578863x	bx = 0;
143	578863x	if (bi == -1) bc = -1;
144		} else {
145	117867804x	bx = b.values[bi];
146	123909872x	bi = b.row_maps[bi];
147	120818819x	bc = bi == -1 ? -1 : b.columns[bi];
148	108430728x	if (NumericVector::is_na(bx) \|\| bx + x == 0) continue;
149		}
150	116228658x	sa += x;
151	116228658x	sb += bx;
152	116228658x	dif = x - bx;
153	116228658x	sse += pow(dif, 2);
154	212765449x	dif = abs(dif);
155	226389827x	sdif += dif;
156	226389827x	if (x \|\| bx) sne += dif / (abs(x) + abs(bx));
157	222475667x	cp += x * bx;
158	222475667x	asq += pow(x, 2);
159	223499606x	bsq += pow(bx, 2);
160	234315438x	sj += x && bx;
161	234315438x	si += x \|\| bx;
162	234315438x	if (i == -1 && bi == -1) break;
163		}
164	!	ma = sa / ncol;
165	!	mb = sb / ncol;
166	!	if (si) {
167	2802775x	if (metrics[0]) jaccard[p] = sj / si;
168	2796600x	if (metrics[2]) canberra[p] = 1 - sne / ncol;
169		}
170	!	if (metrics[1]) euclidean[p] = 1 / (1 + sqrt(sse));
171	2797431x	if (sa && sb) {
172	2797713x	if (metrics[3] && asq && bsq) cosine[p] = cp / sqrt(asq) / sqrt(bsq);
173	2800478x	if (metrics[4]) {
174	20398x	x = (cp / ncol - ma * mb) / sqrt(asq / ncol - pow(ma, 2)) /
175	19960x	sqrt(bsq / ncol - pow(mb, 2));
176		pearson[p] = !isnan(x) ? x : NA_REAL;
177		}
178		}
179		}
180		}
181		};
182
183		/**
184		* @brief Calculate similarities between vectors within two matrices.
185		*
186		* @param m Primary sparse matrix.
187		* @param comp The second sparse matrix, or NULL if pairwise comparisons within
188		* `m` are to be made.
189		* @param type Integer indicating which type of comparison to make: 1 = `m` row
190		* to specific `comp` row, 2 = all `m` rows to `m` rows, 3 = all `m` rows to all
191		* `comp` rows.
192		* @param metrics Binary integer vector indicating which metrics to calculate.
193		* @return List; a sparse matrix containing similarities for each requested
194		* metric.
195		*/
196
197		// [[Rcpp::export]]
198	231x	List calculate_similarities(const S4 &m, const RObject &comp, int &type,
199		const IntegerVector &metrics) {
200	231x	const bool procb = type != 2 && comp.isS4();
201	231x	Sparse_Arrays a(m), b = procb ? Sparse_Arrays(as<S4>(comp)) : a;
202	231x	const IntegerVector dim{a.dims[0], b.dims[0]};
203	231x	size_t nrow = b.dims[0], ai = 1, bi = 0, index = metrics.length();
204	357x	const size_t arow = a.dims[0], n = type == 1 ? a.dims[0]
205	126x	: type == 2 ? arow * (arow - 1) / 2
206	231x	: arow * nrow;
207		// setting up output vectors
208		const CharacterVector metric_names{"jaccard", "euclidean", "canberra",
209	231x	"cosine", "pearson"};
210	231x	unordered_map<String, NumericVector> res;
211	1386x	for (; index--;)
212	1155x	res.insert({metric_names[index], NumericVector(metrics[index] ? n : 0)});
213		// mapping out row-row comparisons
214	231x	IntegerVector aind(n), bind(n), column_starts(nrow + 1);
215	231x	column_starts[nrow] = n;
216	231x	switch (type) {
217	105x	case 1: // each a row to a single b row or paired b rows
218	105x	aind = seq_len(n) - 1;
219	105x	if (nrow == n) bind = aind;
220	105x	break;
221	77x	case 2: // pairwise between a rows
222	60002x	for (index = 0; index < n; index++) {
223	59925x	aind[index] = ai;
224	59925x	bind[index] = bi;
225	59925x	if (++ai == nrow) {
226	2142x	column_starts[++bi] = index + 1;
227	2142x	ai = bi + 1;
228		}
229		}
230	77x	break;
231	49x	case 3: // each a row with each b row
232	49x	ai = 0, nrow = a.dims[0], index = 0;
233	2343945x	for (; index < n; index++) {
234	2343896x	aind[index] = ai;
235	2343896x	bind[index] = bi;
236	2343896x	if (++ai == nrow) {
237	447x	column_starts[++bi] = index + 1;
238	447x	ai = 0;
239		}
240		}
241	49x	break;
242		}
243		// making comparisons
244	462x	Compare sims(a, b, aind, bind, type, metrics, res);
245	231x	checkUserInterrupt();
246	231x	parallelFor(0, n, sims);
247		// formatting output
248	231x	if (a.dims[0] == 1) type = 1;
249	231x	List op;
250	462x	NumericVector sim;
251	1386x	for (const String met : metric_names) {
252	1155x	sim = res.at(met);
253	1155x	if (sim.length()) {
254	307x	if (type == 1) {
255	176x	op.push_back(sim, met);
256		} else {
257	262x	S4 simsm(type == 2 ? "dtCMatrix" : "dgCMatrix");
258	262x	simsm.slot("Dim") = dim;
259	131x	List dimnames(m.slot("Dimnames")),
260		newdimnames{dimnames[0],
261	131x	dim[1] == dim[0] ? dimnames[0] : R_NilValue};
262	131x	if (procb) {
263	46x	List dimnames2(comp.slot("Dimnames"));
264	46x	newdimnames[1] = dimnames2[0];
265		}
266	262x	simsm.slot("Dimnames") = newdimnames;
267	262x	simsm.slot("i") = aind;
268	262x	simsm.slot("p") = column_starts;
269	131x	simsm.slot("x") = sim;
270	131x	if (type == 2) {
271	170x	simsm.slot("uplo") = "L";
272	170x	simsm.slot("diag") = "U";
273		}
274	131x	op.push_back(simsm, met);
275		}
276		}
277		}
278	462x	return op;
279		}