Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
koheiw committed Sep 4, 2024
2 parents 18d216a + d16ec48 commit 04a2cdf
Show file tree
Hide file tree
Showing 84 changed files with 12,792 additions and 203 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
^revdep$
^CRAN-SUBMISSION$
^codecov\.yml$
^docs$
17 changes: 17 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Fix GitHub Linguist language detection
# docs/ and man/ are automatically excluded
vignettes/* linguist-documentation
tests/**/*.html linguist-generated

# Source files
# ============
*.Rdata text
*.rdb binary
*.rds binary
*.Rd text
*.Rdx binary
*.Rmd text
*.R text

# Fix for R checks
configure.ac text eol=lf
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,17 @@ jobs:
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
R_KEEP_PKG_SOURCE: yes

steps:
- if: matrix.config.os == 'ubuntu-latest'
run: |
sudo apt update
sudo apt install libtbb-dev
- if: matrix.config.os == 'macos-latest'
run: |
brew update
brew install tbb
- uses: actions/checkout@v3

- uses: r-lib/actions/setup-pandoc@v2
Expand All @@ -46,5 +55,5 @@ jobs:

- uses: r-lib/actions/check-r-package@v2
with:
upload-snapshots: true

args: 'c("--no-vignettes", "--no-manual", "--as-cran")'
upload-snapshots: true
6 changes: 3 additions & 3 deletions .github/workflows/test-coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- uses: r-lib/actions/setup-r@v2
with:
Expand All @@ -39,12 +39,12 @@ jobs:
if: always()
run: |
## --------------------------------------------------------------------
find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true
shell: bash

- name: Upload test results
if: failure()
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: coverage-test-failures
path: ${{ runner.temp }}/package
9 changes: 4 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Package: quanteda.textstats
Version: 0.96.5
Version: 0.97.3
Title: Textual Statistics for the Quantitative Analysis of Textual Data
Description: Textual statistics functions formerly in the 'quanteda' package.
Textual statistics for characterizing and comparing textual data. Includes
Expand All @@ -19,15 +19,14 @@ License: GPL-3
Depends:
R (>= 3.5.0)
Imports:
quanteda,
quanteda (>= 4.0.0),
Matrix (>= 1.5-0),
methods,
nsyllable,
proxyC (>= 0.1.4),
Rcpp (>= 0.12.12),
RcppParallel,
stringi
LinkingTo: Rcpp, RcppParallel, RcppArmadillo (>= 0.7.600.1.0), quanteda
LinkingTo: Rcpp, RcppArmadillo (>= 0.7.600.1.0), quanteda
Suggests:
entropy,
ExPosition,
Expand All @@ -43,5 +42,5 @@ Encoding: UTF-8
BugReports: https://github.com/quanteda/quanteda.textstats/issues
LazyData: TRUE
Language: en-GB
RoxygenNote: 7.2.3
RoxygenNote: 7.3.2
Roxygen: list(markdown = TRUE)
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ exportMethods(show)
import(Matrix)
import(methods)
importFrom(Rcpp,evalCpp)
importFrom(RcppParallel,RcppParallelLibs)
importFrom(nsyllable,nsyllable)
importFrom(quanteda,as.corpus)
importFrom(quanteda,as.dfm)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# quanteda.textstats 0.97

* Fixes Rd link issues and other issues causing warnings under the new and improved CRAN checks.

# quanteda.textstats 0.96

* Fixes for C++ header compatibility for existing **quanteda** 3.x and the forthcoming 4.0 version.
Expand Down
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

cpp_collocations <- function(texts_, types_, words_ignore_, count_min, sizes_, method, smoothing, thread = 1L) {
cpp_collocations <- function(texts_, types_, words_ignore_, count_min, sizes_, method, smoothing, thread = -1L) {
.Call(`_quanteda_textstats_cpp_collocations`, texts_, types_, words_ignore_, count_min, sizes_, method, smoothing, thread)
}

cpp_keyness <- function(mt, measure, correct, thread = 1L) {
cpp_keyness <- function(mt, measure, correct, thread = -1L) {
.Call(`_quanteda_textstats_cpp_keyness`, mt, measure, correct, thread)
}

2 changes: 1 addition & 1 deletion R/nsyllable-methods.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#' nsyllable methods for tokens
#'
#' Extends `nsyllable()` methods for [quanteda.textstats::tokens] objects.
#' Extends `nsyllable()` methods for [tokens][quanteda::tokens] objects.
#' @inheritParams nsyllable::nsyllable
#' @examples
#' \dontshow{
Expand Down
25 changes: 13 additions & 12 deletions R/textstat_collocations.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@
#' collocations, from text.
#'
#' Documents are grouped for the purposes of scoring, but collocations will not
#' span sentences. If `x` is a [tokens] object and some tokens have been
#' removed, this should be done using `[tokens_remove](x, pattern, padding =
#' TRUE)` so that counts will still be accurate, but the pads will prevent those
#' collocations from being scored.
#' @param x a character, [corpus], or [tokens] object whose collocations will be
#' scored. The tokens object should include punctuation, and if any words
#' have been removed, these should have been removed with `padding = TRUE`.
#' While identifying collocations for tokens objects is supported, you will
#' get better results with character or corpus objects due to relatively
#' imperfect detection of sentence boundaries from texts already tokenized.
#' span sentences. If `x` is a [tokens][quanteda::tokens] object and some tokens
#' have been removed, this should be done using `[tokens_remove](x, pattern,
#' padding = TRUE)` so that counts will still be accurate, but the pads will
#' prevent those collocations from being scored.
#' @param x a character, [corpus][quanteda::corpus], or
#' [tokens][quanteda::tokens] object whose collocations will be scored. The
#' tokens object should include punctuation, and if any words have been
#' removed, these should have been removed with `padding = TRUE`. While
#' identifying collocations for tokens objects is supported, you will get
#' better results with character or corpus objects due to relatively imperfect
#' detection of sentence boundaries from texts already tokenized.
#' @param method association measure for detecting collocations. Currently this
#' is limited to `"lambda"`. See Details.
#' @param size integer; the length of the collocations
Expand All @@ -24,7 +25,7 @@
#' (default is 0.5)
#' @param tolower logical; if `TRUE`, form collocations as lower-cased
#' combinations
#' @param ... additional arguments passed to [tokens()]
#' @param ... additional arguments passed to [tokens()][quanteda::tokens]
#' @references Blaheta, D. & Johnson, M. (2001). [Unsupervised learning of
#' multi-word
#' verbs](http://web.science.mq.edu.au/~mjohnson/papers/2001/dpb-colloc01.pdf).
Expand Down Expand Up @@ -160,7 +161,7 @@ textstat_collocations.tokens <- function(x, method = "lambda",
if (is.null(id_ignore)) id_ignore <- integer()
result <- cpp_collocations(x, types, id_ignore, min_count, size,
if (method == "lambda1") "lambda1" else "lambda",
smoothing, get_threads())
smoothing, quanteda:::get_threads())

# compute z for lambda methods
result$z <- result$lambda / result$sigma
Expand Down
12 changes: 6 additions & 6 deletions R/textstat_frequency.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
#' Tabulate feature frequencies
#'
#' Produces counts and document frequencies summaries of the features in a
#' [dfm], optionally grouped by a [docvars] variable or other supplied
#' grouping variable.
#' @param x a [dfm] object
#' [dfm][quanteda::dfm], optionally grouped by a [docvars][quanteda::docvars]
#' variable or other supplied grouping variable.
#' @param x a [dfm][quanteda::dfm] object
#' @param n (optional) integer specifying the top `n` features to be returned,
#' within group if `groups` is specified
#' @param ties_method character string specifying how ties are treated. See
#' [base::rank()] for details. Unlike that function, however, the default is
#' `"min"`, so that frequencies of 10, 10, 11 would be ranked 1, 1, 3.
#' @param ... additional arguments passed to [dfm_group()]. This can
#' be useful in passing `force = TRUE`, for instance, if you are grouping a
#' dfm that has been weighted.
#' @param ... additional arguments passed to [dfm_group()][quanteda::dfm_group].
#' This can be useful in passing `force = TRUE`, for instance, if you are
#' grouping a dfm that has been weighted.
#' @inheritParams quanteda::groups
#' @return a data.frame containing the following variables:
#' \describe{
Expand Down
7 changes: 4 additions & 3 deletions R/textstat_keyness.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
#'
#' Calculate "keyness", a score for features that occur differentially across
#' different categories. Here, the categories are defined by reference to a
#' "target" document index in the [dfm], with the reference group
#' "target" document index in the [dfm][quanteda::dfm], with the reference group
#' consisting of all other documents.
#' @param x a [dfm] containing the features to be examined for keyness
#' @param x a [dfm][quanteda::dfm] containing the features to be examined for
#' keyness
#' @param target the document index (numeric, character or logical) identifying
#' the document forming the "target" for computing keyness; all other
#' documents' feature frequencies will be combined for use as a reference
Expand Down Expand Up @@ -148,7 +149,7 @@ textstat_keyness.dfm <- function(x, target = 1L, measure = c("chi2", "exact", "l
warning("correction is always none for pmi")
result <- data.frame(
feature = featnames(temp),
stat = cpp_keyness(temp, measure, correction, get_threads()),
stat = cpp_keyness(temp, measure, correction, quanteda:::get_threads()),
p = NA,
n_target = as.vector(temp[1, ]),
n_reference = as.vector(temp[2, ]),
Expand Down
16 changes: 8 additions & 8 deletions R/textstat_lexdiv.R
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@
#' are ignored.}
#' }
#'
#' @param x an [dfm] or [tokens] input object for whose documents
#' lexical diversity will be computed
#' @param x a [dfm][quanteda::dfm] or [tokens][quanteda::tokens] input object
#' for whose documents lexical diversity will be computed
#' @param measure a character vector defining the measure to compute
#' @param remove_numbers logical; if `TRUE` remove features or tokens that
#' consist only of numerals (the Unicode "Number" `[N]` class)
Expand Down Expand Up @@ -280,7 +280,7 @@ textstat_lexdiv.tokens <-
#' @description
#' Internal functions used in [textstat_lexdiv()], for computing
#' lexical diversity measures on dfms or tokens objects
#' @param x a [dfm] object
#' @param x a [dfm][quanteda::dfm] object
#' @param measure a list of lexical diversity measures.
#' @return a `data.frame` with a `document` column containing the
#' input document name, followed by columns with the lexical diversity
Expand All @@ -293,7 +293,7 @@ NULL
#' @param log.base a numeric value defining the base of the logarithm (for
#' measures using logs)
#' @details `compute_lexdiv_dfm_stats` is an internal function that
#' computes the lexical diversity measures from a [dfm] input.
#' computes the lexical diversity measures from a [dfm][quanteda::dfm] input.
#' @importFrom quanteda ntoken ntype docnames
compute_lexdiv_dfm_stats <- function(x, measure = NULL, log.base = 10) {

Expand Down Expand Up @@ -376,7 +376,7 @@ compute_lexdiv_dfm_stats <- function(x, measure = NULL, log.base = 10) {

#' @rdname compute_lexdiv_stats
#' @details `compute_lexdiv_tokens_stats` is an internal function that
#' computes the lexical diversity measures from a [dfm] input.
#' computes the lexical diversity measures from a [dfm][quanteda::dfm] input.
#' @param MATTR_window a numeric value defining the size of the moving window
#' for computation of the Moving-Average Type-Token Ratio (Covington & McFall, 2010)
#' @param MSTTR_segment a numeric value defining the size of the each segment
Expand Down Expand Up @@ -407,7 +407,7 @@ compute_lexdiv_tokens_stats <- function(x, measure = c("MATTR", "MSTTR"),
#' from Covington & McFall (2010), averaging all of the sequential moving
#' windows of tokens of size `MATTR_window` across the text, returning the
#' average as the MATTR.
#' @param x a [tokens] object
#' @param x a [tokens][quanteda::tokens] object
#' @param MATTR_window integer; the size of the moving window for computation of
#' TTR, between 1 and the number of tokens of the document
#' @keywords internal textstat lexdiv
Expand All @@ -434,7 +434,7 @@ compute_mattr <- function(x, MATTR_window = 100L) {
#' Compute the Mean Segmental Type-Token Ratio (MSTTR)
#'
#' Compute the Mean Segmental Type-Token Ratio (Johnson 1944) for a tokens input.
#' @param x input [tokens]
#' @param x input [tokens][quanteda::tokens]
#' @inheritParams textstat_lexdiv
#' @keywords internal textstat lexdiv
compute_msttr <- function(x, MSTTR_segment) {
Expand Down Expand Up @@ -464,7 +464,7 @@ compute_msttr <- function(x, MSTTR_segment) {
#' Takes a dfm that contains features with hyphenated words, such as
#' "split-second" and turns them into features that split the elements
#' in the same way as `tokens(x, remove_hyphens = TRUE)` would have done.
#' @param x input [dfm]
#' @param x input [dfm][quanteda::dfm]
#' @keywords internal dfm
#' @importFrom quanteda featnames tokens dfm_compress
#' @importFrom stringi stri_detect_regex
Expand Down
10 changes: 6 additions & 4 deletions R/textstat_readability.R
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@
#' \item{`"Farr.Jenkins.Paterson"`:}{Farr-Jenkins-Paterson's
#' Simplification of Flesch's Reading Ease Score (Farr, Jenkins and Paterson 1951). \deqn{
#' -31.517 - (1.015 \times ASL) + (1.599 \times
#' \frac{n_{wsy=1}}{n_{w}}}{ -31.517
#' - (1.015 * ASL) + (1.599 * Nwsy1 / Nw)}
#' \frac{n_{wsy=1}}{n_{w}})}{ -31.517
#' - (1.015 * ASL) + (1.599 * Nwsy1 / Nw )}
#'
#' where \eqn{n_{wsy=1}} = Nwsy1 = the number of one-syllable words.}
#'
Expand Down Expand Up @@ -316,7 +316,8 @@
#'
#' }
#'
#' @param x a character or [corpus] object containing the texts
#' @param x a character or [corpus][quanteda::corpus] object containing the
#' texts
#' @param measure character vector defining the readability measure to calculate.
#' Matches are case-insensitive. See other valid measures under Details.
#' @param remove_hyphens if `TRUE`, treat constituent words in hyphenated as
Expand All @@ -330,7 +331,8 @@
#' other cruft that might be in the texts following conversion.
#'
#' For finer-grained control, consider filtering sentences first,
#' including through pattern-matching, using [corpus_trim()].
#' including through pattern-matching, using
#' [corpus_trim()][quanteda::corpus_trim].
#' @param intermediate if `TRUE`, include intermediate quantities in the output
#' @param ... not used
#' @importFrom quanteda texts char_trim nsentence char_tolower tokens_remove dfm
Expand Down
20 changes: 10 additions & 10 deletions R/textstat_simil.R
Original file line number Diff line number Diff line change
Expand Up @@ -182,13 +182,14 @@ setMethod("tail", signature(x = "textstat_proxy"), function(x, n = 6L, ...) {
#' Similarity and distance computation between documents or features
#'
#' These functions compute matrixes of distances and similarities between
#' documents or features from a [dfm()] and return a matrix of
#' similarities or distances in a sparse format. These methods are fast
#' and robust because they operate directly on the sparse [dfm] objects.
#' The output can easily be coerced to an ordinary matrix, a data.frame of
#' pairwise comparisons, or a [dist][stats::dist] format.
#' @param x,y a [dfm] objects; `y` is an optional target matrix matching
#' `x` in the margin on which the similarity or distance will be computed.
#' documents or features from a [dfm][quanteda::dfm] and return a matrix of
#' similarities or distances in a sparse format. These methods are fast and
#' robust because they operate directly on the sparse [dfm][quanteda::dfm]
#' objects. The output can easily be coerced to an ordinary matrix, a data.frame
#' of pairwise comparisons, or a [dist][stats::dist] format.
#' @param x,y [dfm][quanteda::dfm] objects; `y` is an optional target matrix
#' matching `x` in the margin on which the similarity or distance will be
#' computed.
#' @param selection (deprecated - use `y` instead).
#' @param margin identifies the margin of the dfm on which similarity or
#' difference will be computed: `"documents"` for documents or
Expand Down Expand Up @@ -355,7 +356,6 @@ textstat_simil.dfm <- function(x, y = NULL, selection = NULL,
#' @details `textstat_dist` options are: `"euclidean"` (default),
#' `"manhattan"`, `"maximum"`, `"canberra"`,
#' and `"minkowski"`.
#' @importFrom RcppParallel RcppParallelLibs
#' @examples
#'
#' # distances for documents
Expand Down Expand Up @@ -644,8 +644,8 @@ setMethod("as.matrix", "textstat_simil_symm_sparse",
#' This is an underlying function for `textstat_dist` and
#' `textstat_simil` but returns `TsparseMatrix`.
#' @keywords internal
#' @param y if a [dfm] object is provided, proximity between documents or
#' features in `x` and `y` is computed.
#' @param y if a [dfm][quanteda::dfm] object is provided, proximity between
#' documents or features in `x` and `y` is computed.
#' @param use_na if `TRUE`, return `NA` for proximity to empty
#' vectors. Note that use of `NA` makes the proximity matrices denser.
#' @inheritParams textstat_dist
Expand Down
Loading

0 comments on commit 04a2cdf

Please sign in to comment.