From f0cccd6ee80f84d1281f562cae1ece19ad37598d Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Wed, 8 Jun 2022 03:39:44 -0400 Subject: [PATCH] make counting more robust to input datatype (#722) --- Project.toml | 3 +- src/counts.jl | 178 +++++++++++++++---------- src/weights.jl | 2 + test/counts.jl | 347 +++++++++++++++++++++++++++--------------------- test/weights.jl | 2 + 5 files changed, 304 insertions(+), 228 deletions(-) diff --git a/Project.toml b/Project.toml index c4781f7d9..6a35bceb7 100644 --- a/Project.toml +++ b/Project.toml @@ -28,8 +28,9 @@ julia = "1" [extras] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Dates", "DelimitedFiles", "StableRNGs", "Test"] +test = ["Dates", "DelimitedFiles", "OffsetArrays", "StableRNGs", "Test"] diff --git a/src/counts.jl b/src/counts.jl index 580870598..4333edc25 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -16,24 +16,24 @@ end #### functions for counting a single list of integers (1D) """ - addcounts!(r, x, levels::UnitRange{<:Int}, [wv::AbstractWeights]) + addcounts!(r, x, levels::UnitRange{<:Integer}, [wv::AbstractWeights]) Add the number of occurrences in `x` of each value in `levels` to an existing -array `r`. If a weighting vector `wv` is specified, the sum of weights is used -rather than the raw counts. +array `r`. For each `xi ∈ x`, if `xi == levels[j]`, then we increment `r[j]`. + +If a weighting vector `wv` is specified, the sum of weights is used rather than the +raw counts. """ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange) - # add counts of integers from x to r + # add counts of integers from x that fall within levels to r - k = length(levels) - length(r) == k || throw(DimensionMismatch()) + checkbounds(r, axes(levels)...) - m0 = levels[1] - m1 = levels[end] - b = m0 - 1 + m0 = first(levels) + m1 = last(levels) + b = m0 - firstindex(levels) # firstindex(levels) == 1 because levels::IntUnitRange - @inbounds for i in 1 : length(x) - xi = x[i] + @inbounds for xi in x if m0 <= xi <= m1 r[xi - b] += 1 end @@ -42,15 +42,21 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange) end function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) - k = length(levels) - length(r) == k || throw(DimensionMismatch()) + # add wv weighted counts of integers from x that fall within levels to r + + length(x) == length(wv) || + throw(DimensionMismatch("x and wv must have the same length, got $(length(x)) and $(length(wv))")) + + xv = vec(x) # discard shape because weights() discards shape + + checkbounds(r, axes(levels)...) - m0 = levels[1] - m1 = levels[end] + m0 = first(levels) + m1 = last(levels) b = m0 - 1 - @inbounds for i in 1 : length(x) - xi = x[i] + @inbounds for i in eachindex(xv, wv) + xi = xv[i] if m0 <= xi <= m1 r[xi - b] += wv[i] end @@ -69,8 +75,8 @@ falling in that range will be considered (the others will be ignored without raising an error or a warning). If an integer `k` is provided, only values in the range `1:k` will be considered. -If a weighting vector `wv` is specified, the sum of the weights is used rather than the -raw counts. +If a vector of weights `wv` is provided, the proportion of weights is computed rather +than the proportion of raw counts. The output is a vector of length `length(levels)`. """ @@ -90,8 +96,10 @@ counts(x::IntegerArray, wv::AbstractWeights) = counts(x, span(x), wv) proportions(x, levels=span(x), [wv::AbstractWeights]) Return the proportion of values in the range `levels` that occur in `x`. -Equivalent to `counts(x, levels) / length(x)`. If a weighting vector `wv` -is specified, the sum of the weights is used rather than the raw counts. +Equivalent to `counts(x, levels) / length(x)`. + +If a vector of weights `wv` is provided, the proportion of weights is computed rather +than the proportion of raw counts. """ proportions(x::IntegerArray, levels::IntUnitRange) = counts(x, levels) .* inv(length(x)) proportions(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) = @@ -101,6 +109,9 @@ proportions(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) = proportions(x, k::Integer, [wv::AbstractWeights]) Return the proportion of integers in 1 to `k` that occur in `x`. + +If a vector of weights `wv` is provided, the proportion of weights is computed rather +than the proportion of raw counts. """ proportions(x::IntegerArray, k::Integer) = proportions(x, 1:k) proportions(x::IntegerArray, k::Integer, wv::AbstractWeights) = proportions(x, 1:k, wv) @@ -110,26 +121,22 @@ proportions(x::IntegerArray, wv::AbstractWeights) = proportions(x, span(x), wv) #### functions for counting a single list of integers (2D) function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}) - # add counts of integers from x to r - - n = length(x) - length(y) == n || throw(DimensionMismatch()) + # add counts of pairs from zip(x,y) to r xlevels, ylevels = levels - kx = length(xlevels) - ky = length(ylevels) - size(r) == (kx, ky) || throw(DimensionMismatch()) - mx0 = xlevels[1] - mx1 = xlevels[end] - my0 = ylevels[1] - my1 = ylevels[end] + checkbounds(r, axes(xlevels, 1), axes(ylevels, 1)) + + mx0 = first(xlevels) + mx1 = last(xlevels) + my0 = first(ylevels) + my1 = last(ylevels) bx = mx0 - 1 by = my0 - 1 - for i = 1:n + for i in eachindex(vec(x), vec(y)) xi = x[i] yi = y[i] if (mx0 <= xi <= mx1) && (my0 <= yi <= my1) @@ -141,28 +148,31 @@ end function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}, wv::AbstractWeights) - # add counts of integers from x to r + # add counts of pairs from zip(x,y) to r + + length(x) == length(y) == length(wv) || + throw(DimensionMismatch("x, y, and wv must have the same length, but got $(length(x)), $(length(y)), and $(length(wv))")) - n = length(x) - length(y) == length(wv) == n || throw(DimensionMismatch()) + axes(x) == axes(y) || + throw(DimensionMismatch("x and y must have the same axes, but got $(axes(x)) and $(axes(y))")) + + xv, yv = vec(x), vec(y) # discard shape because weights() discards shape xlevels, ylevels = levels - kx = length(xlevels) - ky = length(ylevels) - size(r) == (kx, ky) || throw(DimensionMismatch()) + checkbounds(r, axes(xlevels, 1), axes(ylevels, 1)) - mx0 = xlevels[1] - mx1 = xlevels[end] - my0 = ylevels[1] - my1 = ylevels[end] + mx0 = first(xlevels) + mx1 = last(xlevels) + my0 = first(ylevels) + my1 = last(ylevels) bx = mx0 - 1 by = my0 - 1 - for i = 1:n - xi = x[i] - yi = y[i] + for i in eachindex(xv, yv, wv) + xi = xv[i] + yi = yv[i] if (mx0 <= xi <= mx1) && (my0 <= yi <= my1) r[xi - bx, yi - by] += wv[i] end @@ -235,13 +245,15 @@ end """ - addcounts!(dict, x[, wv]; alg = :auto) + addcounts!(dict, x; alg = :auto) + addcounts!(dict, x, wv) Add counts based on `x` to a count map. New entries will be added if new values come up. + If a weighting vector `wv` is specified, the sum of the weights is used rather than the raw counts. -`alg` can be one of: +`alg` is only allowed for unweighted counting and can be one of: - `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use `:radixsort`, otherwise use `:dict`. @@ -284,9 +296,9 @@ function addcounts_dict!(cm::Dict{T}, x) where T end # If the bits type is of small size i.e. it can have up to 65536 distinct values -# then it is always better to apply a counting-sort like reduce algorithm for +# then it is always better to apply a counting-sort like reduce algorithm for # faster results and less memory usage. However we still wish to enable others -# to write generic algorithms, therefore the methods below still accept the +# to write generic algorithms, therefore the methods below still accept the # `alg` argument but it is ignored. function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored) sumx = sum(x) @@ -335,32 +347,42 @@ const BaseRadixSortSafeTypes = Union{Int8, Int16, Int32, Int64, Int128, "Can the type be safely sorted by radixsort" radixsort_safe(::Type{T}) where T = T<:BaseRadixSortSafeTypes -function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T +function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractVector{T}) where T isempty(sx) && return cm - last_sx = sx[1] - tmpcount = get(cm, last_sx, 0) + 1 + last_sx = first(sx) + start_i = firstindex(sx) # now the data is sorted: can just run through and accumulate values before # adding into the Dict - @inbounds for i in 2:length(sx) + @inbounds for i in start_i+1:lastindex(sx) sxi = sx[i] - if last_sx == sxi - tmpcount += 1 - else - cm[last_sx] = tmpcount + if last_sx != sxi + cm[last_sx] = get(cm, last_sx, 0) + i - start_i last_sx = sxi - tmpcount = get(cm, last_sx, 0) + 1 + start_i = i end end - cm[sx[end]] = tmpcount + last_sx = last(sx) + cm[last_sx] = get(cm, last_sx, 0) + lastindex(sx) + 1 - start_i return cm end +function _alg(x::AbstractArray) + @static if VERSION >= v"1.9.0-DEV" + return Base.DEFAULT_UNSTABLE + else + firstindex(x) == 1 || + throw(ArgumentError("alg = :radixsort requires either one based indexing or Julia >= 1.9. " * + "Use `alg = :dict` as an alternative.")) + return SortingAlgorithms.RadixSort + end +end + function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T # sort the x using radixsort - sx = sort(x, alg = RadixSort) + sx = sort(vec(x), alg=_alg(x)) # Delegate the loop to a separate function since sort might not # be inferred in Julia 0.6 after SortingAlgorithms is loaded. @@ -369,18 +391,24 @@ function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T end # fall-back for `x` an iterator -function addcounts_radixsort!(cm::Dict{T}, x) where T - sx = sort!(collect(x), alg = RadixSort) +function addcounts_radixsort!(cm::Dict{T}, x) where T + cx = vec(collect(x)) + sx = sort!(cx, alg = _alg(cx)) return _addcounts_radix_sort_loop!(cm, sx) end function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} - n = length(x) - length(wv) == n || throw(DimensionMismatch()) + # add wv weighted counts of integers from x to cm + + length(x) == length(wv) || + throw(DimensionMismatch("x and wv must have the same length, got $(length(x)) and $(length(wv))")) + + xv = vec(x) # discard shape because weights() discards shape + z = zero(W) - for i = 1 : n - @inbounds xi = x[i] + for i in eachindex(xv, wv) + @inbounds xi = xv[i] @inbounds wi = wv[i] cm[xi] = get(cm, xi, z) + wi end @@ -390,11 +418,14 @@ end """ countmap(x; alg = :auto) - countmap(x::AbstractVector, w::AbstractVector{<:Real}; alg = :auto) + countmap(x::AbstractVector, wv::AbstractVector{<:Real}) -Return a dictionary mapping each unique value in `x` to its number -of occurrences. A vector of weights `w` can be provided when `x` is a vector. +Return a dictionary mapping each unique value in `x` to its number of occurrences. +If a weighting vector `wv` is specified, the sum of weights is used rather than the +raw counts. + +`alg` is only allowed for unweighted counting and can be one of: - `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use `:radixsort`, otherwise use `:dict`. @@ -414,9 +445,12 @@ countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcoun """ proportionmap(x) + proportionmap(x::AbstractVector, w::AbstractVector{<:Real}) + +Return a dictionary mapping each unique value in `x` to its proportion in `x`. -Return a dictionary mapping each unique value in `x` to its -proportion in `x`. +If a vector of weights `wv` is provided, the proportion of weights is computed rather +than the proportion of raw counts. """ proportionmap(x::AbstractArray) = _normalize_countmap(countmap(x), length(x)) proportionmap(x::AbstractArray, wv::AbstractWeights) = _normalize_countmap(countmap(x, wv), sum(wv)) diff --git a/src/weights.jl b/src/weights.jl index 50043226b..9cd1a98db 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -21,6 +21,7 @@ length(wv::AbstractWeights) = length(wv.values) sum(wv::AbstractWeights) = wv.sum isempty(wv::AbstractWeights) = isempty(wv.values) size(wv::AbstractWeights) = size(wv.values) +Base.axes(wv::AbstractWeights) = Base.axes(wv.values) Base.dataids(wv::AbstractWeights) = Base.dataids(wv.values) @@ -301,6 +302,7 @@ sum(wv::UnitWeights{T}) where T = convert(T, length(wv)) isempty(wv::UnitWeights) = iszero(wv.len) length(wv::UnitWeights) = wv.len size(wv::UnitWeights) = tuple(length(wv)) +Base.axes(wv::UnitWeights) = tuple(Base.OneTo(length(wv))) Base.convert(::Type{Vector}, wv::UnitWeights{T}) where {T} = ones(T, length(wv)) diff --git a/test/counts.jl b/test/counts.jl index d7b6fea0b..f5d6ae69f 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -1,166 +1,203 @@ using StatsBase using Test +using OffsetArrays n = 5000 -# 1D integer counts - -x = rand(1:5, n) -w = weights(rand(n)) - -c = counts(x, 5) -@test size(c) == (5,) -c0 = Int[count(v->v == i, x) for i in 1:5] -@test c == c0 -@test counts(x .+ 1, 2:6) == c0 -@test proportions(x, 1:5) ≈ (c0 ./ n) - -c = counts(x) -@test size(c) == (5,) -c0 = Int[count(v->v == i, x) for i in 1:5] -@test c == c0 -@test counts(x .+ 1, 2:6) == c0 -@test proportions(x) ≈ (c0 ./ n) - -c = counts(x, 5, w) -@test size(c) == (5,) -c0 = Float64[sum(w.values[x .== i]) for i in 1:5] -@test c ≈ c0 -@test counts(x .+ 1, 2:6, w) ≈ c0 -@test proportions(x, 1:5, w) ≈ (c0 ./ sum(w)) - -c = counts(x, w) -@test size(c) == (5,) -c0 = Float64[sum(w.values[x .== i]) for i in 1:5] -@test c ≈ c0 -@test counts(x .+ 1, 2:6, w) ≈ c0 -@test proportions(x, w) ≈ (c0 ./ sum(w)) - -# 2D integer counts - -x = rand(1:4, n) -y = rand(1:5, n) -w = weights(rand(n)) - -c = counts(x, y, (4, 5)) -@test size(c) == (4, 5) -c0 = Int[count(t->t != 0, (x .== i) .& (y .== j)) for i in 1:4, j in 1:5] -@test c == c0 -@test counts(x .+ 2, y .+ 3, (3:6, 4:8)) == c0 -@test proportions(x, y, (1:4, 1:5)) ≈ (c0 ./ n) - -c = counts(x, y) -@test size(c) == (4, 5) -c0 = Int[count(t->t != 0, (x .== i) .& (y .== j)) for i in 1:4, j in 1:5] -@test c == c0 -@test counts(x .+ 2, y .+ 3, (3:6, 4:8)) == c0 -@test proportions(x, y,) ≈ (c0 ./ n) - -c = counts(x, y, (4, 5), w) -@test size(c) == (4, 5) -c0 = Float64[sum(w.values[(x .== i) .& (y .== j)]) for i in 1:4, j in 1:5] -@test c ≈ c0 -@test counts(x .+ 2, y .+ 3, (3:6, 4:8), w) ≈ c0 -@test proportions(x, y, (1:4, 1:5), w) ≈ (c0 ./ sum(w)) - -c = counts(x, y, w) -@test size(c) == (4, 5) -c0 = Float64[sum(w.values[(x .== i) .& (y .== j)]) for i in 1:4, j in 1:5] -@test c ≈ c0 -@test counts(x .+ 2, y .+ 3, (3:6, 4:8), w) ≈ c0 -@test proportions(x, y, w) ≈ (c0 ./ sum(w)) - - -# count map - -x = ["a", "b", "a", "a", "b", "c"] -w = [1.0, 1.5, 2.0, 2.5, 3.0, 3.5] - -cm = countmap(x) -@test cm["a"] == 3 -@test cm["b"] == 2 -@test cm["c"] == 1 - -# iterator, non-radixsort -cm_missing = countmap(skipmissing(x)) -cm_any_itr = countmap((i for i in x)) -@test cm_missing == cm_any_itr == cm -@test cm_missing isa Dict{String, Int} -@test cm_any_itr isa Dict{Any, Int} - -pm = proportionmap(x) -@test pm["a"] ≈ (1/2) -@test pm["b"] ≈ (1/3) -@test pm["c"] ≈ (1/6) - - -# testing the radixsort branch of countmap -xx = repeat([6, 1, 3, 1], outer=100_000) -cm = countmap(xx) -@test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) - -# with iterator -cm_missing = countmap(skipmissing(xx)) -@test cm_missing isa Dict{Int, Int} -@test cm_missing == cm - -cm_any_itr = countmap((i for i in xx)) -@test cm_any_itr isa Dict{Any,Int} # no knowledge about type -@test cm_missing == cm - -# with empty array -@test countmap(Int[]) == Dict{Int, Int}() - -# testing the radixsort-based addcounts -xx = repeat([6, 1, 3, 1], outer=100_000) -cm = Dict{Int, Int}() -StatsBase.addcounts_radixsort!(cm,xx) -@test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) -xx2 = repeat([7, 1, 3, 1], outer=100_000) -StatsBase.addcounts_radixsort!(cm,xx2) -@test cm == Dict(1 => 400_000, 3 => 200_000, 6 => 100_000, 7 => 100_000) -# with iterator -cm_missing = Dict{Int, Int}() -StatsBase.addcounts_radixsort!(cm_missing,skipmissing(xx)) -@test cm_missing == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) -StatsBase.addcounts_radixsort!(cm_missing,skipmissing(xx2)) -@test cm_missing == Dict(1 => 400_000, 3 => 200_000, 6 => 100_000, 7 => 100_000) - -# testing the Dict-based addcounts -cm = Dict{Int, Int}() -cm_itr = Dict{Int, Int}() -StatsBase.addcounts_dict!(cm,xx) -StatsBase.addcounts_dict!(cm_itr,skipmissing(xx)) -@test cm_itr == cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) -@test cm_itr isa Dict{Int, Int} - -cm = countmap(x, weights(w)) -@test cm["a"] == 5.5 -@test cm["b"] == 4.5 -@test cm["c"] == 3.5 - -@test cm == countmap(x, w) - -pm = proportionmap(x, weights(w)) -@test pm["a"] ≈ (5.5 / 13.5) -@test pm["b"] ≈ (4.5 / 13.5) -@test pm["c"] ≈ (3.5 / 13.5) - -# testing small bits type -bx = [true, false, true, true, false] -cm_bx_missing = countmap(skipmissing(bx)) -@test cm_bx_missing == countmap(bx) == Dict(true => 3, false => 2) -@test cm_bx_missing isa Dict{Bool, Int} - -for T in [UInt8, UInt16, Int8, Int16] - tx = T[typemin(T), 8, typemax(T), 19, 8] - tx_missing = skipmissing(T[typemin(T), 8, typemax(T), 19, 8]) - cm_tx_missing = countmap(tx_missing) - @test cm_tx_missing == countmap(tx) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1) - @test cm_tx_missing isa Dict{T, Int} +@testset "1D integer counts" begin + x = rand(1:5, n) + w = weights(rand(n)) + x0 = deepcopy(x) + w0 = deepcopy(w) + + c0 = Int[count(v->v == i, x) for i in 1:5] + @test counts(x, 5) == c0 + @test counts(x .+ 1, 2:6) == c0 + @test proportions(x, 1:5) ≈ (c0 ./ n) + @test counts(reshape(x, 10, 50, 10), 5) == c0 + + @test counts(x) == c0 + @test proportions(x) ≈ (c0 ./ n) + @test counts(reshape(x, 10, 50, 10)) == c0 + + c0 = reshape(c0, 1, 5) + @test addcounts!(fill(0, 1, 5), x, 1:5) == c0 + @test addcounts!(fill(0, 1, 5), reshape(x, 10, 50, 10), 1:5) == c0 + + c0 = Float64[sum(w.values[x .== i]) for i in 1:5] + @test counts(x, 5, w) ≈ c0 + @test counts(x .+ 1, 2:6, w) ≈ c0 + @test proportions(x, 1:5, w) ≈ (c0 ./ sum(w)) + @test counts(reshape(x, 10, 50, 10), 5, w) ≈ c0 # Perhaps this should not be allowed + + @test counts(x, w) ≈ c0 + @test counts(x .+ 1, 2:6, w) ≈ c0 + @test proportions(x, w) ≈ (c0 ./ sum(w)) + @test counts(reshape(x, 10, 50, 10), w) ≈ c0 # Perhaps this should not be allowed + + #addcounts! to row matrix + c0 = reshape(c0, 1, 5) + @test addcounts!(fill(0.0, 1, 5), x, 1:5, w) ≈ c0 + @test addcounts!(fill(0.0, 1, 5), reshape(x, 10, 50, 10), 1:5, w) ≈ c0 # Perhaps this should not be allowed + + @test x == x0 + @test w == w0 +end + + +@testset "2D integer counts" begin + x = rand(1:4, n) + y = rand(1:5, n) + w = weights(rand(n)) + x0 = deepcopy(x) + y0 = deepcopy(y) + w0 = deepcopy(w) + + c0 = Int[count(t->t != 0, (x .== i) .& (y .== j)) for i in 1:4, j in 1:5] + @test counts(x, y, (4, 5)) == c0 + @test counts(x .+ 2, y .+ 3, (3:6, 4:8)) == c0 + @test proportions(x, y, (1:4, 1:5)) ≈ (c0 ./ n) + @test counts(reshape(x, 10, 50, 10), reshape(y, 10, 50, 10), (4, 5)) == c0 + + @test counts(x, y) == c0 + @test counts(x .+ 2, y .+ 3, (3:6, 4:8)) == c0 + @test proportions(x, y,) ≈ (c0 ./ n) + @test counts(reshape(x, 10, 50, 10), reshape(y, 10, 50, 10)) == c0 + + c0 = Float64[sum(w.values[(x .== i) .& (y .== j)]) for i in 1:4, j in 1:5] + @test counts(x, y, (4, 5), w) ≈ c0 + @test counts(x .+ 2, y .+ 3, (3:6, 4:8), w) ≈ c0 + @test proportions(x, y, (1:4, 1:5), w) ≈ (c0 ./ sum(w)) + @test counts(reshape(x, 10, 50, 10), reshape(y, 10, 50, 10), (4, 5), w) ≈ c0 # Perhaps this should not be allowed + + @test counts(x, y, w) ≈ c0 + @test counts(x .+ 2, y .+ 3, (3:6, 4:8), w) ≈ c0 + @test proportions(x, y, w) ≈ (c0 ./ sum(w)) + @test counts(reshape(x, 10, 50, 10), reshape(y, 10, 50, 10), w) ≈ c0 # Perhaps this should not be allowed + + @test x == x0 + @test y == y0 + @test w == w0 +end + +@testset "count map" begin + x = ["a", "b", "a", "a", "b", "c"] + w = [1.0, 1.5, 2.0, 2.5, 3.0, 3.5] + + cm = countmap(x) + @test cm["a"] == 3 + @test cm["b"] == 2 + @test cm["c"] == 1 + + # iterator, non-radixsort + cm_missing = countmap(skipmissing(x)) + cm_any_itr = countmap((i for i in x)) + @test cm_missing == cm_any_itr == cm + @test cm_missing isa Dict{String, Int} + @test cm_any_itr isa Dict{Any, Int} + + pm = proportionmap(x) + @test pm["a"] ≈ (1/2) + @test pm["b"] ≈ (1/3) + @test pm["c"] ≈ (1/6) + + + # testing the radixsort branch of countmap + xx = repeat([6, 1, 3, 1], outer=100_000) + cm = countmap(xx) + @test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) + + # with iterator + cm_missing = countmap(skipmissing(xx)) + @test cm_missing isa Dict{Int, Int} + @test cm_missing == cm + + cm_any_itr = countmap((i for i in xx)) + @test cm_any_itr isa Dict{Any,Int} # no knowledge about type + @test cm_any_itr == cm + + # with multidimensional array + @test countmap(reshape(xx, 20, 100, 20, 10); alg=:radixsort) == cm + @test countmap(reshape(xx, 20, 100, 20, 10); alg=:dict) == cm + + # with empty array + @test countmap(Int[]) == Dict{Int, Int}() + + # testing the radixsort-based addcounts + xx = repeat([6, 1, 3, 1], outer=100_000) + cm = Dict{Int, Int}() + StatsBase.addcounts_radixsort!(cm,xx) + @test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) + xx2 = repeat([7, 1, 3, 1], outer=100_000) + StatsBase.addcounts_radixsort!(cm,xx2) + @test cm == Dict(1 => 400_000, 3 => 200_000, 6 => 100_000, 7 => 100_000) + # with iterator + cm_missing = Dict{Int, Int}() + StatsBase.addcounts_radixsort!(cm_missing,skipmissing(xx)) + @test cm_missing == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) + StatsBase.addcounts_radixsort!(cm_missing,skipmissing(xx2)) + @test cm_missing == Dict(1 => 400_000, 3 => 200_000, 6 => 100_000, 7 => 100_000) + + # testing the Dict-based addcounts + cm = Dict{Int, Int}() + cm_itr = Dict{Int, Int}() + StatsBase.addcounts_dict!(cm,xx) + StatsBase.addcounts_dict!(cm_itr,skipmissing(xx)) + @test cm_itr == cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) + @test cm_itr isa Dict{Int, Int} + + cm = countmap(x, weights(w)) + @test cm["a"] == 5.5 + @test cm["b"] == 4.5 + @test cm["c"] == 3.5 + + @test cm == countmap(x, w) + @test cm == countmap(reshape(x, 2, 3), w) + + pm = proportionmap(x, weights(w)) + @test pm["a"] ≈ (5.5 / 13.5) + @test pm["b"] ≈ (4.5 / 13.5) + @test pm["c"] ≈ (3.5 / 13.5) + + # testing small bits type + bx = [true, false, true, true, false] + cm_bx_missing = countmap(skipmissing(bx)) + @test cm_bx_missing == countmap(bx) == Dict(true => 3, false => 2) + @test cm_bx_missing isa Dict{Bool, Int} + + for T in [UInt8, UInt16, Int8, Int16] + tx = T[typemin(T), 8, typemax(T), 19, 8] + tx_missing = skipmissing(T[typemin(T), 8, typemax(T), 19, 8]) + cm_tx_missing = countmap(tx_missing) + @test cm_tx_missing == countmap(tx) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1) + @test cm_tx_missing isa Dict{T, Int} + end end @testset "views" begin X = view([1,1,1,2,2], 1:5) @test countmap(X) == countmap(copy(X)) end + +if VERSION >= v"1.9.0-DEV" + @testset "offset arrays" begin + x = rand(1:5, n) + w = rand(n) + xw = weights(w) + y = OffsetArray(x, n÷2) + yw = weights(OffsetArray(w, n÷2)) + z = OffsetArray(x, -2n) + zw = weights(OffsetArray(w, -2n)) + + # proportions calls counts which calls addcounts! + @test proportions(x) == proportions(y) == proportions(z) + @test proportions(x, xw) == proportions(y, yw) == proportions(z, zw) + @test proportionmap(x) == proportionmap(y) == proportionmap(z) + @test proportionmap(x, xw) == proportionmap(y, yw) == proportionmap(z, zw) + @test countmap(x) == countmap(x; alg = :dict) == countmap(x; alg = :radixsort) == + countmap(y) == countmap(y; alg = :dict) == countmap(y; alg = :radixsort) == + countmap(z) == countmap(z; alg = :dict) == countmap(z; alg = :radixsort) + @test proportionmap(x, xw) == proportionmap(y, yw) == proportionmap(z, zw) + # countmap and proportionmap only support the :dict algorithm for weighted sums. + end +end diff --git a/test/weights.jl b/test/weights.jl index 8562c5691..e8f7febe8 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -13,6 +13,7 @@ weight_funcs = (weights, aweights, fweights, pweights) @test isempty(f(Float64[])) @test size(f([1, 2, 3])) == (3,) + @test axes(f([1, 2, 3])) == (Base.OneTo(3),) w = [1., 2., 3.] wv = f(w) @@ -107,6 +108,7 @@ end @test !isempty(wv) @test length(wv) === 3 @test size(wv) === (3,) + @test axes(wv) === (Base.OneTo(3),) @test sum(wv) === 3. @test wv == fill(1.0, 3) @test StatsBase.varcorrection(wv) == 1/3