Merge branch 'master' of https://github.com/quanteda/quanteda.textstats

quanteda · Sep 4, 2024 · 04a2cdf · 04a2cdf
2 parents 18d216a + d16ec48
commit 04a2cdf
Show file tree

Hide file tree

Showing 84 changed files with 12,792 additions and 203 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -9,3 +9,4 @@
 ^revdep$
 ^CRAN-SUBMISSION$
 ^codecov\.yml$
+^docs$
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,17 @@
+# Fix GitHub Linguist language detection
+# docs/ and man/ are automatically excluded
+vignettes/*       linguist-documentation
+tests/**/*.html   linguist-generated
+
+# Source files
+# ============
+*.Rdata     text
+*.rdb       binary
+*.rds       binary
+*.Rd        text
+*.Rdx       binary
+*.Rmd       text
+*.R         text
+
+# Fix for R checks
+configure.ac text eol=lf
diff --git a/.github/workflows/R-CMD-check.yaml → .github/workflows/check-standard.yaml b/.github/workflows/R-CMD-check.yaml → .github/workflows/check-standard.yaml
@@ -27,8 +27,17 @@ jobs:
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
       R_KEEP_PKG_SOURCE: yes
-
+    
     steps:
+      - if: matrix.config.os == 'ubuntu-latest'
+        run: |
+            sudo apt update
+            sudo apt install libtbb-dev
+      - if: matrix.config.os == 'macos-latest'
+        run: |
+            brew update
+            brew install tbb
+        
       - uses: actions/checkout@v3
 
       - uses: r-lib/actions/setup-pandoc@v2
@@ -46,5 +55,5 @@ jobs:
 
       - uses: r-lib/actions/check-r-package@v2
         with:
-          upload-snapshots: true 
-
+          args: 'c("--no-vignettes", "--no-manual", "--as-cran")'
+          upload-snapshots: true
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
@@ -15,7 +15,7 @@ jobs:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - uses: r-lib/actions/setup-r@v2
         with:
@@ -39,12 +39,12 @@ jobs:
         if: always()
         run: |
           ## --------------------------------------------------------------------
-          find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
+          find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true
         shell: bash
 
       - name: Upload test results
         if: failure()
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: coverage-test-failures
           path: ${{ runner.temp }}/package
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: quanteda.textstats
-Version: 0.96.5
+Version: 0.97.3
 Title: Textual Statistics for the Quantitative Analysis of Textual Data
 Description: Textual statistics functions formerly in the 'quanteda' package.
     Textual statistics for characterizing and comparing textual data. Includes 
@@ -19,15 +19,14 @@ License: GPL-3
 Depends:
     R (>= 3.5.0)
 Imports:
-    quanteda,
+    quanteda (>= 4.0.0),
     Matrix (>= 1.5-0),
     methods,
     nsyllable,
     proxyC (>= 0.1.4),
     Rcpp (>= 0.12.12),
-    RcppParallel,
     stringi
-LinkingTo: Rcpp, RcppParallel, RcppArmadillo (>= 0.7.600.1.0), quanteda
+LinkingTo: Rcpp, RcppArmadillo (>= 0.7.600.1.0), quanteda
 Suggests:
     entropy,
     ExPosition,
@@ -43,5 +42,5 @@ Encoding: UTF-8
 BugReports: https://github.com/quanteda/quanteda.textstats/issues
 LazyData: TRUE
 Language: en-GB
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 Roxygen: list(markdown = TRUE)
diff --git a/NAMESPACE b/NAMESPACE
@@ -53,7 +53,6 @@ exportMethods(show)
 import(Matrix)
 import(methods)
 importFrom(Rcpp,evalCpp)
-importFrom(RcppParallel,RcppParallelLibs)
 importFrom(nsyllable,nsyllable)
 importFrom(quanteda,as.corpus)
 importFrom(quanteda,as.dfm)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# quanteda.textstats 0.97
+
+* Fixes Rd link issues and other issues causing warnings under the new and improved CRAN checks.
+
 # quanteda.textstats 0.96
 
 * Fixes for C++ header compatibility for existing **quanteda** 3.x and the forthcoming 4.0 version.

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,11 +1,11 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-cpp_collocations <- function(texts_, types_, words_ignore_, count_min, sizes_, method, smoothing, thread = 1L) {
+cpp_collocations <- function(texts_, types_, words_ignore_, count_min, sizes_, method, smoothing, thread = -1L) {
     .Call(`_quanteda_textstats_cpp_collocations`, texts_, types_, words_ignore_, count_min, sizes_, method, smoothing, thread)
 }
 
-cpp_keyness <- function(mt, measure, correct, thread = 1L) {
+cpp_keyness <- function(mt, measure, correct, thread = -1L) {
     .Call(`_quanteda_textstats_cpp_keyness`, mt, measure, correct, thread)
 }
 
diff --git a/R/nsyllable-methods.R b/R/nsyllable-methods.R
@@ -1,6 +1,6 @@
 #' nsyllable methods for tokens
 #'
-#' Extends `nsyllable()` methods for [quanteda.textstats::tokens] objects.
+#' Extends `nsyllable()` methods for [tokens][quanteda::tokens] objects.
 #' @inheritParams nsyllable::nsyllable
 #' @examples
 #' \dontshow{

diff --git a/R/textstat_collocations.R b/R/textstat_collocations.R
@@ -4,16 +4,17 @@
 #' collocations, from text.
 #'
 #' Documents are grouped for the purposes of scoring, but collocations will not
-#' span sentences. If `x` is a [tokens] object and some tokens have been
-#' removed, this should be done using `[tokens_remove](x, pattern, padding =
-#' TRUE)` so that counts will still be accurate, but the pads will prevent those
-#' collocations from being scored.
-#' @param x a character, [corpus], or [tokens] object whose collocations will be
-#'   scored.  The tokens object should include punctuation, and if any words
-#'   have been removed, these should have been removed with `padding = TRUE`.
-#'   While identifying collocations for tokens objects is supported, you will
-#'   get better results with character or corpus objects due to relatively
-#'   imperfect detection of sentence boundaries from texts already tokenized.
+#' span sentences. If `x` is a [tokens][quanteda::tokens] object and some tokens
+#' have been removed, this should be done using `[tokens_remove](x, pattern,
+#' padding = TRUE)` so that counts will still be accurate, but the pads will
+#' prevent those collocations from being scored.
+#' @param x a character, [corpus][quanteda::corpus], or
+#'   [tokens][quanteda::tokens] object whose collocations will be scored.  The
+#'   tokens object should include punctuation, and if any words have been
+#'   removed, these should have been removed with `padding = TRUE`. While
+#'   identifying collocations for tokens objects is supported, you will get
+#'   better results with character or corpus objects due to relatively imperfect
+#'   detection of sentence boundaries from texts already tokenized.
 #' @param method association measure for detecting collocations. Currently this
 #'   is limited to `"lambda"`.  See Details.
 #' @param size integer; the length of the collocations
@@ -24,7 +25,7 @@
 #'   (default is 0.5)
 #' @param tolower logical; if `TRUE`, form collocations as lower-cased
 #'   combinations
-#' @param ... additional arguments passed to [tokens()]
+#' @param ... additional arguments passed to [tokens()][quanteda::tokens]
 #' @references Blaheta, D. & Johnson, M. (2001). [Unsupervised learning of
 #'   multi-word
 #'   verbs](http://web.science.mq.edu.au/~mjohnson/papers/2001/dpb-colloc01.pdf).
@@ -160,7 +161,7 @@ textstat_collocations.tokens <- function(x, method = "lambda",
     if (is.null(id_ignore)) id_ignore <- integer()
     result <- cpp_collocations(x, types, id_ignore, min_count, size,
                                if (method == "lambda1") "lambda1" else "lambda",
-                               smoothing, get_threads())
+                               smoothing, quanteda:::get_threads())
 
     # compute z for lambda methods
     result$z <- result$lambda / result$sigma

diff --git a/R/textstat_frequency.R b/R/textstat_frequency.R
@@ -1,17 +1,17 @@
 #' Tabulate feature frequencies
 #'
 #' Produces counts and document frequencies summaries of the features in a
-#' [dfm], optionally grouped by a [docvars] variable or other supplied
-#' grouping variable.
-#' @param x a [dfm] object
+#' [dfm][quanteda::dfm], optionally grouped by a [docvars][quanteda::docvars]
+#' variable or other supplied grouping variable.
+#' @param x a [dfm][quanteda::dfm] object
 #' @param n (optional) integer specifying the top `n` features to be returned,
 #' within group if `groups` is specified
 #' @param ties_method character string specifying how ties are treated.  See
 #'   [base::rank()] for details.  Unlike that function, however, the default is
 #'   `"min"`, so that frequencies of 10, 10, 11 would be ranked 1, 1, 3.
-#' @param ... additional arguments passed to [dfm_group()].  This can
-#'   be useful in passing `force = TRUE`, for instance, if you are grouping a
-#'   dfm that has been weighted.
+#' @param ... additional arguments passed to [dfm_group()][quanteda::dfm_group].
+#'   This can be useful in passing `force = TRUE`, for instance, if you are
+#'   grouping a dfm that has been weighted.
 #' @inheritParams quanteda::groups
 #' @return a data.frame containing the following variables:
 #' \describe{

diff --git a/R/textstat_keyness.R b/R/textstat_keyness.R
@@ -2,9 +2,10 @@
 #'
 #' Calculate "keyness", a score for features that occur differentially across
 #' different categories.  Here, the categories are defined by reference to a
-#' "target" document index in the [dfm], with the reference group
+#' "target" document index in the [dfm][quanteda::dfm], with the reference group
 #' consisting of all other documents.
-#' @param x a [dfm] containing the features to be examined for keyness
+#' @param x a [dfm][quanteda::dfm] containing the features to be examined for
+#'   keyness
 #' @param target the document index (numeric, character or logical) identifying
 #'   the document forming the "target" for computing keyness; all other
 #'   documents' feature frequencies will be combined for use as a reference
@@ -148,7 +149,7 @@ textstat_keyness.dfm <- function(x, target = 1L, measure = c("chi2", "exact", "l
             warning("correction is always none for pmi")
         result <- data.frame(
             feature = featnames(temp),
-            stat = cpp_keyness(temp, measure, correction, get_threads()),
+            stat = cpp_keyness(temp, measure, correction, quanteda:::get_threads()),
             p = NA,
             n_target = as.vector(temp[1, ]),
             n_reference = as.vector(temp[2, ]),

diff --git a/R/textstat_lexdiv.R b/R/textstat_lexdiv.R
@@ -74,8 +74,8 @@
 #'   are ignored.}
 #'   }
 #'
-#' @param x an [dfm] or [tokens] input object for whose documents
-#'   lexical diversity will be computed
+#' @param x a [dfm][quanteda::dfm] or [tokens][quanteda::tokens] input object
+#'   for whose documents lexical diversity will be computed
 #' @param measure a character vector defining the measure to compute
 #' @param remove_numbers logical; if `TRUE` remove features or tokens that
 #'   consist only of numerals (the Unicode "Number" `[N]` class)
@@ -280,7 +280,7 @@ textstat_lexdiv.tokens <-
 #' @description
 #' Internal functions used in [textstat_lexdiv()], for computing
 #' lexical diversity measures on dfms or tokens objects
-#' @param x a [dfm] object
+#' @param x a [dfm][quanteda::dfm] object
 #' @param measure a list of lexical diversity measures.
 #' @return a `data.frame` with a `document` column containing the
 #'   input document name, followed by columns with the lexical diversity
@@ -293,7 +293,7 @@ NULL
 #' @param log.base a numeric value defining the base of the logarithm (for
 #'   measures using logs)
 #' @details `compute_lexdiv_dfm_stats` is an internal function that
-#'   computes the lexical diversity measures from a [dfm] input.
+#'   computes the lexical diversity measures from a [dfm][quanteda::dfm] input.
 #' @importFrom quanteda ntoken ntype docnames
 compute_lexdiv_dfm_stats <- function(x, measure = NULL, log.base = 10) {
 
@@ -376,7 +376,7 @@ compute_lexdiv_dfm_stats <- function(x, measure = NULL, log.base = 10) {
 
 #' @rdname compute_lexdiv_stats
 #' @details `compute_lexdiv_tokens_stats` is an internal function that
-#'   computes the lexical diversity measures from a [dfm] input.
+#'   computes the lexical diversity measures from a [dfm][quanteda::dfm] input.
 #' @param MATTR_window a numeric value defining the size of the moving window
 #'   for computation of the Moving-Average Type-Token Ratio (Covington & McFall, 2010)
 #' @param MSTTR_segment a numeric value defining the size of the each segment
@@ -407,7 +407,7 @@ compute_lexdiv_tokens_stats <- function(x, measure = c("MATTR", "MSTTR"),
 #' from Covington & McFall (2010), averaging all of the sequential moving
 #' windows of tokens of size `MATTR_window` across the text, returning the
 #' average as the MATTR.
-#' @param x a [tokens] object
+#' @param x a [tokens][quanteda::tokens] object
 #' @param MATTR_window integer; the size of the moving window for computation of
 #'   TTR, between 1 and the number of tokens of the document
 #' @keywords internal textstat lexdiv
@@ -434,7 +434,7 @@ compute_mattr <- function(x, MATTR_window = 100L) {
 #' Compute the Mean Segmental Type-Token Ratio (MSTTR)
 #'
 #' Compute the Mean Segmental Type-Token Ratio (Johnson 1944) for a tokens input.
-#' @param x input [tokens]
+#' @param x input [tokens][quanteda::tokens]
 #' @inheritParams textstat_lexdiv
 #' @keywords internal textstat lexdiv
 compute_msttr <- function(x, MSTTR_segment) {
@@ -464,7 +464,7 @@ compute_msttr <- function(x, MSTTR_segment) {
 #' Takes a dfm that contains features with hyphenated words, such as
 #' "split-second" and turns them into features that split the elements
 #' in the same way as `tokens(x, remove_hyphens = TRUE)` would have done.
-#' @param x input [dfm]
+#' @param x input [dfm][quanteda::dfm]
 #' @keywords internal dfm
 #' @importFrom quanteda featnames tokens dfm_compress
 #' @importFrom stringi stri_detect_regex

diff --git a/R/textstat_readability.R b/R/textstat_readability.R
@@ -123,8 +123,8 @@
 #'   \item{`"Farr.Jenkins.Paterson"`:}{Farr-Jenkins-Paterson's
 #'   Simplification of Flesch's Reading Ease Score (Farr, Jenkins and Paterson 1951). \deqn{
 #'    -31.517 - (1.015 \times ASL) + (1.599 \times
-#'   \frac{n_{wsy=1}}{n_{w}}}{ -31.517
-#'   - (1.015 * ASL) + (1.599 * Nwsy1 / Nw)}
+#'   \frac{n_{wsy=1}}{n_{w}})}{ -31.517
+#'   - (1.015 * ASL) + (1.599 * Nwsy1 / Nw )}
 #'
 #'   where \eqn{n_{wsy=1}} = Nwsy1 = the number of one-syllable words.}
 #'
@@ -316,7 +316,8 @@
 #'
 #' }
 #'
-#' @param x a character or [corpus] object containing the texts
+#' @param x a character or [corpus][quanteda::corpus] object containing the
+#'   texts
 #' @param measure character vector defining the readability measure to calculate.
 #'   Matches are case-insensitive.  See other valid measures under Details.
 #' @param remove_hyphens if `TRUE`, treat constituent words in hyphenated as
@@ -330,7 +331,8 @@
 #'   other cruft that might be in the texts following conversion.
 #'
 #'   For finer-grained control, consider filtering sentences first,
-#'   including through pattern-matching, using [corpus_trim()].
+#'   including through pattern-matching, using
+#'   [corpus_trim()][quanteda::corpus_trim].
 #' @param intermediate if `TRUE`, include intermediate quantities in the output
 #' @param ... not used
 #' @importFrom quanteda texts char_trim nsentence char_tolower tokens_remove dfm

diff --git a/R/textstat_simil.R b/R/textstat_simil.R
@@ -182,13 +182,14 @@ setMethod("tail", signature(x = "textstat_proxy"), function(x, n = 6L, ...) {
 #' Similarity and distance computation between documents or features
 #'
 #' These functions compute matrixes of distances and similarities between
-#' documents or features from a [dfm()] and return a matrix of
-#' similarities or distances in a sparse format.  These methods are fast
-#' and robust because they operate directly on the sparse [dfm] objects.
-#' The output can easily be coerced to an ordinary matrix, a data.frame of
-#' pairwise comparisons, or a [dist][stats::dist] format.
-#' @param x,y a [dfm] objects; `y` is an optional target matrix matching
-#'   `x` in the margin on which the similarity or distance will be computed.
+#' documents or features from a [dfm][quanteda::dfm] and return a matrix of
+#' similarities or distances in a sparse format.  These methods are fast and
+#' robust because they operate directly on the sparse [dfm][quanteda::dfm]
+#' objects. The output can easily be coerced to an ordinary matrix, a data.frame
+#' of pairwise comparisons, or a [dist][stats::dist] format.
+#' @param x,y [dfm][quanteda::dfm] objects; `y` is an optional target matrix
+#'   matching `x` in the margin on which the similarity or distance will be
+#'   computed.
 #' @param selection (deprecated - use `y` instead).
 #' @param margin identifies the margin of the dfm on which similarity or
 #'   difference will be computed:  `"documents"` for documents or
@@ -355,7 +356,6 @@ textstat_simil.dfm <- function(x, y = NULL, selection = NULL,
 #' @details `textstat_dist` options are: `"euclidean"` (default),
 #'   `"manhattan"`, `"maximum"`, `"canberra"`,
 #'   and `"minkowski"`.
-#' @importFrom RcppParallel RcppParallelLibs
 #' @examples
 #'
 #' # distances for documents
@@ -644,8 +644,8 @@ setMethod("as.matrix", "textstat_simil_symm_sparse",
 #' This is an underlying function for `textstat_dist` and
 #' `textstat_simil` but returns `TsparseMatrix`.
 #' @keywords internal
-#' @param y if a [dfm] object is provided, proximity between documents or
-#'   features in `x` and `y` is computed.
+#' @param y if a [dfm][quanteda::dfm] object is provided, proximity between
+#'   documents or features in `x` and `y` is computed.
 #' @param use_na if `TRUE`, return `NA` for proximity to empty
 #'   vectors. Note that use of `NA` makes the proximity matrices denser.
 #' @inheritParams textstat_dist