From 3905f6eaa01a16d1321cea1696cce7d6d3c36ca0 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Sat, 2 Oct 2021 13:43:15 -0500 Subject: [PATCH 01/43] support offset arrays and simplify _addcounts_radix_sort_loop --- src/counts.jl | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 580870598..8af1ebb71 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -337,23 +337,22 @@ radixsort_safe(::Type{T}) where T = T<:BaseRadixSortSafeTypes function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T isempty(sx) && return cm - last_sx = sx[1] - tmpcount = get(cm, last_sx, 0) + 1 + last_sx = first(sx) + start_i = firstindex(sx) # now the data is sorted: can just run through and accumulate values before # adding into the Dict - @inbounds for i in 2:length(sx) + @inbounds for i in start_i+1:lastindex(sx) sxi = sx[i] - if last_sx == sxi - tmpcount += 1 - else - cm[last_sx] = tmpcount + if last_sx != sxi + cm[last_sx] = get(cm, last_sx, 0) + i - start_i last_sx = sxi - tmpcount = get(cm, last_sx, 0) + 1 + start_i = i end end - cm[sx[end]] = tmpcount + last_sx = last(sx) + cm[last_sx] = get(cm, last_sx, 0) + length(sx) + 1 - start_i return cm end From 8eb5855938aa724e25c966a9ab734b0e611fc180 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Wed, 6 Oct 2021 20:23:49 -0500 Subject: [PATCH 02/43] add tests and fix more occurances of unsupported offset arrays [ref](https://docs.julialang.org/en/v1/devdocs/offset-arrays/#Things-to-watch-out-for) --- src/counts.jl | 77 +++++++++++++++++++++++++------------------------- src/weights.jl | 3 +- test/counts.jl | 24 +++++++++++++++- 3 files changed, 63 insertions(+), 41 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 8af1ebb71..f47c4cfba 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -23,16 +23,15 @@ array `r`. If a weighting vector `wv` is specified, the sum of weights is used rather than the raw counts. """ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange) - # add counts of integers from x to r + # add counts of integers from x that fall within levels to r - k = length(levels) - length(r) == k || throw(DimensionMismatch()) + @boundscheck checkbounds(r, axes(levels)...) - m0 = levels[1] - m1 = levels[end] + m0 = first(levels) + m1 = last(levels) b = m0 - 1 - @inbounds for i in 1 : length(x) + @inbounds for i in eachindex(x) xi = x[i] if m0 <= xi <= m1 r[xi - b] += 1 @@ -42,14 +41,16 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange) end function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) - k = length(levels) - length(r) == k || throw(DimensionMismatch()) + # add wv weighted counts of integers from x that fall within levels to r + + @boundscheck checkbounds(r, axes(levels)...) + @boundscheck axes(x) == axes(wv) || throw(DimensionMismatch("x and wv must have the same axes")) - m0 = levels[1] - m1 = levels[end] + m0 = first(levels) + m1 = last(levels) b = m0 - 1 - @inbounds for i in 1 : length(x) + @inbounds for i in eachindex(x) xi = x[i] if m0 <= xi <= m1 r[xi - b] += wv[i] @@ -112,24 +113,23 @@ proportions(x::IntegerArray, wv::AbstractWeights) = proportions(x, span(x), wv) function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}) # add counts of integers from x to r - n = length(x) - length(y) == n || throw(DimensionMismatch()) + @boundscheck( axes(x) == axes(y) || + throw(DimensionMismatch("x and y must have the same axes"))) xlevels, ylevels = levels - kx = length(xlevels) - ky = length(ylevels) - size(r) == (kx, ky) || throw(DimensionMismatch()) + @boundscheck checkbounds(r, axes(xlevels, 1), axes(ylevels, 1)) - mx0 = xlevels[1] - mx1 = xlevels[end] - my0 = ylevels[1] - my1 = ylevels[end] + + mx0 = first(xlevels) + mx1 = last(xlevels) + my0 = first(ylevels) + my1 = last(ylevels) bx = mx0 - 1 by = my0 - 1 - for i = 1:n + for i = eachindex(x) xi = x[i] yi = y[i] if (mx0 <= xi <= mx1) && (my0 <= yi <= my1) @@ -143,24 +143,22 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}, wv::AbstractWeights) # add counts of integers from x to r - n = length(x) - length(y) == length(wv) == n || throw(DimensionMismatch()) + @boundscheck(axes(x) == axes(y) == axes(wv) || + throw(DimensionMismatch("x, y, and wv must have the same axes"))) xlevels, ylevels = levels - kx = length(xlevels) - ky = length(ylevels) - size(r) == (kx, ky) || throw(DimensionMismatch()) + @boundscheck checkbounds(r, axes(xlevels, 1), axes(ylevels, 1)) - mx0 = xlevels[1] - mx1 = xlevels[end] - my0 = ylevels[1] - my1 = ylevels[end] + mx0 = first(xlevels) + mx1 = last(xlevels) + my0 = first(ylevels) + my1 = last(ylevels) bx = mx0 - 1 by = my0 - 1 - for i = 1:n + for i = eachindex(x) xi = x[i] yi = y[i] if (mx0 <= xi <= mx1) && (my0 <= yi <= my1) @@ -284,9 +282,9 @@ function addcounts_dict!(cm::Dict{T}, x) where T end # If the bits type is of small size i.e. it can have up to 65536 distinct values -# then it is always better to apply a counting-sort like reduce algorithm for +# then it is always better to apply a counting-sort like reduce algorithm for # faster results and less memory usage. However we still wish to enable others -# to write generic algorithms, therefore the methods below still accept the +# to write generic algorithms, therefore the methods below still accept the # `alg` argument but it is ignored. function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored) sumx = sum(x) @@ -352,7 +350,7 @@ function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T end last_sx = last(sx) - cm[last_sx] = get(cm, last_sx, 0) + length(sx) + 1 - start_i + cm[last_sx] = get(cm, last_sx, 0) + firstindex(sx) + 1 - start_i return cm end @@ -368,17 +366,18 @@ function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T end # fall-back for `x` an iterator -function addcounts_radixsort!(cm::Dict{T}, x) where T +function addcounts_radixsort!(cm::Dict{T}, x) where T sx = sort!(collect(x), alg = RadixSort) return _addcounts_radix_sort_loop!(cm, sx) end function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} - n = length(x) - length(wv) == n || throw(DimensionMismatch()) + @boundscheck(axes(x) == axes(wv) || + throw(DimensionMismatch("x and wv must have the same axes"))) + z = zero(W) - for i = 1 : n + for i = eachindex(x) @inbounds xi = x[i] @inbounds wi = wv[i] cm[xi] = get(cm, xi, z) + wi @@ -417,5 +416,5 @@ countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcoun Return a dictionary mapping each unique value in `x` to its proportion in `x`. """ -proportionmap(x::AbstractArray) = _normalize_countmap(countmap(x), length(x)) +proportionmap(x::AbstractArray; alg = :auto) = _normalize_countmap(countmap(x; alg = alg), length(x)) proportionmap(x::AbstractArray, wv::AbstractWeights) = _normalize_countmap(countmap(x, wv), sum(wv)) diff --git a/src/weights.jl b/src/weights.jl index 34fe4cd77..af509722e 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -21,6 +21,7 @@ length(wv::AbstractWeights) = length(wv.values) sum(wv::AbstractWeights) = wv.sum isempty(wv::AbstractWeights) = isempty(wv.values) size(wv::AbstractWeights) = size(wv.values) +Base.axes(wv::AbstractWeights) = Base.axes(wv.values) Base.convert(::Type{Vector}, wv::AbstractWeights) = convert(Vector, wv.values) @@ -310,7 +311,7 @@ julia> uweights(3) 1 1 1 - + julia> uweights(Float64, 3) 3-element UnitWeights{Float64}: 1.0 diff --git a/test/counts.jl b/test/counts.jl index d7b6fea0b..bdc35e5d2 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -1,5 +1,6 @@ using StatsBase using Test +using OffsetArrays n = 5000 @@ -104,7 +105,7 @@ cm_missing = countmap(skipmissing(xx)) @test cm_missing isa Dict{Int, Int} @test cm_missing == cm -cm_any_itr = countmap((i for i in xx)) +cm_any_itr = countmap((i for i in xx)) @test cm_any_itr isa Dict{Any,Int} # no knowledge about type @test cm_missing == cm @@ -164,3 +165,24 @@ end X = view([1,1,1,2,2], 1:5) @test countmap(X) == countmap(copy(X)) end + +@testset "offset arrays" begin + x = rand(1:5, n) + w = rand(n) + xw = weights(w) + y = OffsetArray(x, n÷2) + yw = weights(OffsetArray(w, n÷2)) + z = OffsetArray(x, -2n) + zw = weights(OffsetArray(w, -2n)) + + # proportions calls counts which calls addcounts! + @test proportions(x) == proportions(y) == proportions(z) + @test proportions(x, xw) == proportions(y, yw) == proportions(z, zw) + @test proportionmap(x) == proportionmap(y) == proportionmap(z) + @test proportionmap(x, xw) == proportionmap(y, yw) == proportionmap(z, zw) + @test (proportionmap(x) == proportionmap(x; alg = :dict) == proportionmap(x; alg = :radixsort) + == proportionmap(y) == proportionmap(y; alg = :dict) == proportionmap(y; alg = :radixsort) + == proportionmap(z) == proportionmap(z; alg = :dict) == proportionmap(z; alg = :radixsort)) + @test proportionmap(x, xw) == proportionmap(y, yw) == proportionmap(z, zw) + # countmap and proportionmap only support the :dict algorithm for weighted sums. +end From 84ef887174bf0497f49433d22dc13406540c8a9a Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sun, 10 Oct 2021 17:54:47 -0500 Subject: [PATCH 03/43] Apply suggestion from code review Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index f47c4cfba..5d7d4add6 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -31,8 +31,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange) m1 = last(levels) b = m0 - 1 - @inbounds for i in eachindex(x) - xi = x[i] + @inbounds for xi in x if m0 <= xi <= m1 r[xi - b] += 1 end From d82295db69c3c27d0fdfa1fac04ec0c77ae49001 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sun, 10 Oct 2021 18:05:03 -0500 Subject: [PATCH 04/43] Replace dimension checking with varargs `eachindex` Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 5d7d4add6..e4f8adba7 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -43,13 +43,12 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv: # add wv weighted counts of integers from x that fall within levels to r @boundscheck checkbounds(r, axes(levels)...) - @boundscheck axes(x) == axes(wv) || throw(DimensionMismatch("x and wv must have the same axes")) m0 = first(levels) m1 = last(levels) b = m0 - 1 - @inbounds for i in eachindex(x) + @inbounds for i in eachindex(x, wv) xi = x[i] if m0 <= xi <= m1 r[xi - b] += wv[i] @@ -112,8 +111,6 @@ proportions(x::IntegerArray, wv::AbstractWeights) = proportions(x, span(x), wv) function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}) # add counts of integers from x to r - @boundscheck( axes(x) == axes(y) || - throw(DimensionMismatch("x and y must have the same axes"))) xlevels, ylevels = levels @@ -128,7 +125,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels:: bx = mx0 - 1 by = my0 - 1 - for i = eachindex(x) + for i in eachindex(x, y) xi = x[i] yi = y[i] if (mx0 <= xi <= mx1) && (my0 <= yi <= my1) @@ -142,8 +139,6 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}, wv::AbstractWeights) # add counts of integers from x to r - @boundscheck(axes(x) == axes(y) == axes(wv) || - throw(DimensionMismatch("x, y, and wv must have the same axes"))) xlevels, ylevels = levels @@ -157,7 +152,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, bx = mx0 - 1 by = my0 - 1 - for i = eachindex(x) + for i in eachindex(x, y, wv) xi = x[i] yi = y[i] if (mx0 <= xi <= mx1) && (my0 <= yi <= my1) @@ -371,12 +366,10 @@ function addcounts_radixsort!(cm::Dict{T}, x) where T end function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} - @boundscheck(axes(x) == axes(wv) || - throw(DimensionMismatch("x and wv must have the same axes"))) z = zero(W) - for i = eachindex(x) + for i in eachindex(x, wv) @inbounds xi = x[i] @inbounds wi = wv[i] cm[xi] = get(cm, xi, z) + wi From a4505b2b08e83f15ced3bb77c4d8dac883664232 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sun, 10 Oct 2021 18:09:20 -0500 Subject: [PATCH 05/43] Test addcounts! on row vector --- test/counts.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/counts.jl b/test/counts.jl index bdc35e5d2..89ebda351 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -14,6 +14,7 @@ c = counts(x, 5) c0 = Int[count(v->v == i, x) for i in 1:5] @test c == c0 @test counts(x .+ 1, 2:6) == c0 +@test addcounts!([0 0 0 0 0], x, 1:5) == reshape(c0, 1, 5) @test proportions(x, 1:5) ≈ (c0 ./ n) c = counts(x) From c334d4d7efe2690e493b048b33543f060f64332b Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sun, 10 Oct 2021 18:33:45 -0500 Subject: [PATCH 06/43] Support multidimensional arrays for :radixsort --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index e4f8adba7..021339f74 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -351,7 +351,7 @@ end function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T # sort the x using radixsort - sx = sort(x, alg = RadixSort) + sx = sort(vec(x), alg = RadixSort) # Delegate the loop to a separate function since sort might not # be inferred in Julia 0.6 after SortingAlgorithms is loaded. From 94b3e6c716c5d944dc1eb2c53c079f41006b2f2b Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Mon, 11 Oct 2021 09:27:50 -0500 Subject: [PATCH 07/43] fix lastindex _addcounts_radix_sort_loop! indexing --- src/counts.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 021339f74..8924fb5d5 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -330,7 +330,7 @@ radixsort_safe(::Type{T}) where T = T<:BaseRadixSortSafeTypes function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T isempty(sx) && return cm last_sx = first(sx) - start_i = firstindex(sx) + start_i = firstindex(sx)::Integer # now the data is sorted: can just run through and accumulate values before # adding into the Dict @@ -344,7 +344,7 @@ function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T end last_sx = last(sx) - cm[last_sx] = get(cm, last_sx, 0) + firstindex(sx) + 1 - start_i + cm[last_sx] = get(cm, last_sx, 0) + lastindex(sx) + 1 - start_i return cm end From 87074b26aedeec3131be3ee1ef4e5c4c3709a3a3 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Mon, 11 Oct 2021 09:55:46 -0500 Subject: [PATCH 08/43] organize and extend testing; revert to flattening x when passed weights for compatability. --- src/counts.jl | 3 + test/counts.jl | 328 +++++++++++++++++++++++++------------------------ 2 files changed, 172 insertions(+), 159 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 8924fb5d5..4b74b8597 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -42,6 +42,8 @@ end function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) # add wv weighted counts of integers from x that fall within levels to r + x = vec(x) # discard shape because weights() discards shape + @boundscheck checkbounds(r, axes(levels)...) m0 = first(levels) @@ -139,6 +141,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}, wv::AbstractWeights) # add counts of integers from x to r + x, y = vec(x), vec(y) # discard shape because weights() discards shape xlevels, ylevels = levels diff --git a/test/counts.jl b/test/counts.jl index 89ebda351..f81f2ea9a 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -2,164 +2,174 @@ using StatsBase using Test using OffsetArrays +#TODO: firstindex -> lastindex +#TEST multidimensional input on radix (and dict) + n = 5000 +_reshape(x) = reshape(x, 10, 50, 10) + +@testset "1D integer counts" begin + x = rand(1:5, n) + w = weights(rand(n)) + x0 = deepcopy(x) + w0 = deepcopy(w) + + c0 = Int[count(v->v == i, x) for i in 1:5] + @test counts(x, 5) == c0 + @test counts(x .+ 1, 2:6) == c0 + @test proportions(x, 1:5) ≈ (c0 ./ n) + @test counts(_reshape(x), 5) == c0 + + @test counts(x) == c0 + @test proportions(x) ≈ (c0 ./ n) + @test counts(_reshape(x)) == c0 + + c0 = reshape(c0, 1, 5) + @test addcounts!(fill(0, 1, 5), x, 1:5) == c0 + @test addcounts!(fill(0, 1, 5), _reshape(x), 1:5) == c0 + + c0 = Float64[sum(w.values[x .== i]) for i in 1:5] + @test counts(x, 5, w) ≈ c0 + @test counts(x .+ 1, 2:6, w) ≈ c0 + @test proportions(x, 1:5, w) ≈ (c0 ./ sum(w)) + @test counts(_reshape(x), 5, w) ≈ c0 # Perhaps this should not be allowed + + @test counts(x, w) ≈ c0 + @test counts(x .+ 1, 2:6, w) ≈ c0 + @test proportions(x, w) ≈ (c0 ./ sum(w)) + @test counts(_reshape(x), w) ≈ c0 # Perhaps this should not be allowed + + #addcounts! to row vector + c0 = reshape(c0, 1, 5) + @test addcounts!(fill(0.0, 1, 5), x, 1:5, w) ≈ c0 + @test addcounts!(fill(0.0, 1, 5), _reshape(x), 1:5, w) ≈ c0 # Perhaps this should not be allowed + + @test x == x0 + @test w == w0 +end + + +@testset "2D integer counts" begin + x = rand(1:4, n) + y = rand(1:5, n) + x0 = deepcopy(x) + y0 = deepcopy(y) + w0 = deepcopy(w) + + c0 = Int[count(t->t != 0, (x .== i) .& (y .== j)) for i in 1:4, j in 1:5] + @test counts(x, y, (4, 5)) == c0 + @test counts(x .+ 2, y .+ 3, (3:6, 4:8)) == c0 + @test proportions(x, y, (1:4, 1:5)) ≈ (c0 ./ n) + @test counts(_reshape(x), _reshape(y), (4, 5)) == c0 + + @test counts(x, y) == c0 + @test counts(x .+ 2, y .+ 3, (3:6, 4:8)) == c0 + @test proportions(x, y,) ≈ (c0 ./ n) + @test counts(_reshape(x), _reshape(y)) == c0 + + c0 = Float64[sum(w.values[(x .== i) .& (y .== j)]) for i in 1:4, j in 1:5] + @test counts(x, y, (4, 5), w) ≈ c0 + @test counts(x .+ 2, y .+ 3, (3:6, 4:8), w) ≈ c0 + @test proportions(x, y, (1:4, 1:5), w) ≈ (c0 ./ sum(w)) + @test counts(_reshape(x), _reshape(y), (4, 5), w) ≈ c0 # Perhaps this should not be allowed + + @test counts(x, y, w) ≈ c0 + @test counts(x .+ 2, y .+ 3, (3:6, 4:8), w) ≈ c0 + @test proportions(x, y, w) ≈ (c0 ./ sum(w)) + @test counts(_reshape(x), _reshape(y), w) ≈ c0 # Perhaps this should not be allowed + + @test x == x0 + @test y == y0 + @test w == w0 +end -# 1D integer counts - -x = rand(1:5, n) -w = weights(rand(n)) - -c = counts(x, 5) -@test size(c) == (5,) -c0 = Int[count(v->v == i, x) for i in 1:5] -@test c == c0 -@test counts(x .+ 1, 2:6) == c0 -@test addcounts!([0 0 0 0 0], x, 1:5) == reshape(c0, 1, 5) -@test proportions(x, 1:5) ≈ (c0 ./ n) - -c = counts(x) -@test size(c) == (5,) -c0 = Int[count(v->v == i, x) for i in 1:5] -@test c == c0 -@test counts(x .+ 1, 2:6) == c0 -@test proportions(x) ≈ (c0 ./ n) - -c = counts(x, 5, w) -@test size(c) == (5,) -c0 = Float64[sum(w.values[x .== i]) for i in 1:5] -@test c ≈ c0 -@test counts(x .+ 1, 2:6, w) ≈ c0 -@test proportions(x, 1:5, w) ≈ (c0 ./ sum(w)) - -c = counts(x, w) -@test size(c) == (5,) -c0 = Float64[sum(w.values[x .== i]) for i in 1:5] -@test c ≈ c0 -@test counts(x .+ 1, 2:6, w) ≈ c0 -@test proportions(x, w) ≈ (c0 ./ sum(w)) - -# 2D integer counts - -x = rand(1:4, n) -y = rand(1:5, n) -w = weights(rand(n)) - -c = counts(x, y, (4, 5)) -@test size(c) == (4, 5) -c0 = Int[count(t->t != 0, (x .== i) .& (y .== j)) for i in 1:4, j in 1:5] -@test c == c0 -@test counts(x .+ 2, y .+ 3, (3:6, 4:8)) == c0 -@test proportions(x, y, (1:4, 1:5)) ≈ (c0 ./ n) - -c = counts(x, y) -@test size(c) == (4, 5) -c0 = Int[count(t->t != 0, (x .== i) .& (y .== j)) for i in 1:4, j in 1:5] -@test c == c0 -@test counts(x .+ 2, y .+ 3, (3:6, 4:8)) == c0 -@test proportions(x, y,) ≈ (c0 ./ n) - -c = counts(x, y, (4, 5), w) -@test size(c) == (4, 5) -c0 = Float64[sum(w.values[(x .== i) .& (y .== j)]) for i in 1:4, j in 1:5] -@test c ≈ c0 -@test counts(x .+ 2, y .+ 3, (3:6, 4:8), w) ≈ c0 -@test proportions(x, y, (1:4, 1:5), w) ≈ (c0 ./ sum(w)) - -c = counts(x, y, w) -@test size(c) == (4, 5) -c0 = Float64[sum(w.values[(x .== i) .& (y .== j)]) for i in 1:4, j in 1:5] -@test c ≈ c0 -@test counts(x .+ 2, y .+ 3, (3:6, 4:8), w) ≈ c0 -@test proportions(x, y, w) ≈ (c0 ./ sum(w)) - - -# count map - -x = ["a", "b", "a", "a", "b", "c"] -w = [1.0, 1.5, 2.0, 2.5, 3.0, 3.5] - -cm = countmap(x) -@test cm["a"] == 3 -@test cm["b"] == 2 -@test cm["c"] == 1 - -# iterator, non-radixsort -cm_missing = countmap(skipmissing(x)) -cm_any_itr = countmap((i for i in x)) -@test cm_missing == cm_any_itr == cm -@test cm_missing isa Dict{String, Int} -@test cm_any_itr isa Dict{Any, Int} - -pm = proportionmap(x) -@test pm["a"] ≈ (1/2) -@test pm["b"] ≈ (1/3) -@test pm["c"] ≈ (1/6) - - -# testing the radixsort branch of countmap -xx = repeat([6, 1, 3, 1], outer=100_000) -cm = countmap(xx) -@test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) - -# with iterator -cm_missing = countmap(skipmissing(xx)) -@test cm_missing isa Dict{Int, Int} -@test cm_missing == cm - -cm_any_itr = countmap((i for i in xx)) -@test cm_any_itr isa Dict{Any,Int} # no knowledge about type -@test cm_missing == cm - -# with empty array -@test countmap(Int[]) == Dict{Int, Int}() - -# testing the radixsort-based addcounts -xx = repeat([6, 1, 3, 1], outer=100_000) -cm = Dict{Int, Int}() -StatsBase.addcounts_radixsort!(cm,xx) -@test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) -xx2 = repeat([7, 1, 3, 1], outer=100_000) -StatsBase.addcounts_radixsort!(cm,xx2) -@test cm == Dict(1 => 400_000, 3 => 200_000, 6 => 100_000, 7 => 100_000) -# with iterator -cm_missing = Dict{Int, Int}() -StatsBase.addcounts_radixsort!(cm_missing,skipmissing(xx)) -@test cm_missing == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) -StatsBase.addcounts_radixsort!(cm_missing,skipmissing(xx2)) -@test cm_missing == Dict(1 => 400_000, 3 => 200_000, 6 => 100_000, 7 => 100_000) - -# testing the Dict-based addcounts -cm = Dict{Int, Int}() -cm_itr = Dict{Int, Int}() -StatsBase.addcounts_dict!(cm,xx) -StatsBase.addcounts_dict!(cm_itr,skipmissing(xx)) -@test cm_itr == cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) -@test cm_itr isa Dict{Int, Int} - -cm = countmap(x, weights(w)) -@test cm["a"] == 5.5 -@test cm["b"] == 4.5 -@test cm["c"] == 3.5 - -@test cm == countmap(x, w) - -pm = proportionmap(x, weights(w)) -@test pm["a"] ≈ (5.5 / 13.5) -@test pm["b"] ≈ (4.5 / 13.5) -@test pm["c"] ≈ (3.5 / 13.5) - -# testing small bits type -bx = [true, false, true, true, false] -cm_bx_missing = countmap(skipmissing(bx)) -@test cm_bx_missing == countmap(bx) == Dict(true => 3, false => 2) -@test cm_bx_missing isa Dict{Bool, Int} - -for T in [UInt8, UInt16, Int8, Int16] - tx = T[typemin(T), 8, typemax(T), 19, 8] - tx_missing = skipmissing(T[typemin(T), 8, typemax(T), 19, 8]) - cm_tx_missing = countmap(tx_missing) - @test cm_tx_missing == countmap(tx) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1) - @test cm_tx_missing isa Dict{T, Int} +@testset "count map" begin + x = ["a", "b", "a", "a", "b", "c"] + w = [1.0, 1.5, 2.0, 2.5, 3.0, 3.5] + + cm = countmap(x) + @test cm["a"] == 3 + @test cm["b"] == 2 + @test cm["c"] == 1 + + # iterator, non-radixsort + cm_missing = countmap(skipmissing(x)) + cm_any_itr = countmap((i for i in x)) + @test cm_missing == cm_any_itr == cm + @test cm_missing isa Dict{String, Int} + @test cm_any_itr isa Dict{Any, Int} + + pm = proportionmap(x) + @test pm["a"] ≈ (1/2) + @test pm["b"] ≈ (1/3) + @test pm["c"] ≈ (1/6) + + + # testing the radixsort branch of countmap + xx = repeat([6, 1, 3, 1], outer=100_000) + cm = countmap(xx) + @test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) + + # with iterator + cm_missing = countmap(skipmissing(xx)) + @test cm_missing isa Dict{Int, Int} + @test cm_missing == cm + + cm_any_itr = countmap((i for i in xx)) + @test cm_any_itr isa Dict{Any,Int} # no knowledge about type + @test cm_missing == cm + + # with empty array + @test countmap(Int[]) == Dict{Int, Int}() + + # testing the radixsort-based addcounts + xx = repeat([6, 1, 3, 1], outer=100_000) + cm = Dict{Int, Int}() + StatsBase.addcounts_radixsort!(cm,xx) + @test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) + xx2 = repeat([7, 1, 3, 1], outer=100_000) + StatsBase.addcounts_radixsort!(cm,xx2) + @test cm == Dict(1 => 400_000, 3 => 200_000, 6 => 100_000, 7 => 100_000) + # with iterator + cm_missing = Dict{Int, Int}() + StatsBase.addcounts_radixsort!(cm_missing,skipmissing(xx)) + @test cm_missing == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) + StatsBase.addcounts_radixsort!(cm_missing,skipmissing(xx2)) + @test cm_missing == Dict(1 => 400_000, 3 => 200_000, 6 => 100_000, 7 => 100_000) + + # testing the Dict-based addcounts + cm = Dict{Int, Int}() + cm_itr = Dict{Int, Int}() + StatsBase.addcounts_dict!(cm,xx) + StatsBase.addcounts_dict!(cm_itr,skipmissing(xx)) + @test cm_itr == cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) + @test cm_itr isa Dict{Int, Int} + + cm = countmap(x, weights(w)) + @test cm["a"] == 5.5 + @test cm["b"] == 4.5 + @test cm["c"] == 3.5 + + @test cm == countmap(x, w) + + pm = proportionmap(x, weights(w)) + @test pm["a"] ≈ (5.5 / 13.5) + @test pm["b"] ≈ (4.5 / 13.5) + @test pm["c"] ≈ (3.5 / 13.5) + + # testing small bits type + bx = [true, false, true, true, false] + cm_bx_missing = countmap(skipmissing(bx)) + @test cm_bx_missing == countmap(bx) == Dict(true => 3, false => 2) + @test cm_bx_missing isa Dict{Bool, Int} + + for T in [UInt8, UInt16, Int8, Int16] + tx = T[typemin(T), 8, typemax(T), 19, 8] + tx_missing = skipmissing(T[typemin(T), 8, typemax(T), 19, 8]) + cm_tx_missing = countmap(tx_missing) + @test cm_tx_missing == countmap(tx) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1) + @test cm_tx_missing isa Dict{T, Int} + end end @testset "views" begin @@ -181,9 +191,9 @@ end @test proportions(x, xw) == proportions(y, yw) == proportions(z, zw) @test proportionmap(x) == proportionmap(y) == proportionmap(z) @test proportionmap(x, xw) == proportionmap(y, yw) == proportionmap(z, zw) - @test (proportionmap(x) == proportionmap(x; alg = :dict) == proportionmap(x; alg = :radixsort) - == proportionmap(y) == proportionmap(y; alg = :dict) == proportionmap(y; alg = :radixsort) - == proportionmap(z) == proportionmap(z; alg = :dict) == proportionmap(z; alg = :radixsort)) + @test (countmap(x) == countmap(x; alg = :dict) == countmap(x; alg = :radixsort) + == countmap(y) == countmap(y; alg = :dict) == countmap(y; alg = :radixsort) + == countmap(z) == countmap(z; alg = :dict) == countmap(z; alg = :radixsort)) @test proportionmap(x, xw) == proportionmap(y, yw) == proportionmap(z, zw) # countmap and proportionmap only support the :dict algorithm for weighted sums. end From aa21bc914c3d8e9e2a7bd7d0e54c5c837464fc62 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Mon, 11 Oct 2021 09:59:53 -0500 Subject: [PATCH 09/43] removed todo list --- test/counts.jl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/test/counts.jl b/test/counts.jl index f81f2ea9a..feac99ccf 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -2,9 +2,6 @@ using StatsBase using Test using OffsetArrays -#TODO: firstindex -> lastindex -#TEST multidimensional input on radix (and dict) - n = 5000 _reshape(x) = reshape(x, 10, 50, 10) @@ -39,7 +36,7 @@ _reshape(x) = reshape(x, 10, 50, 10) @test proportions(x, w) ≈ (c0 ./ sum(w)) @test counts(_reshape(x), w) ≈ c0 # Perhaps this should not be allowed - #addcounts! to row vector + #addcounts! to row matrix c0 = reshape(c0, 1, 5) @test addcounts!(fill(0.0, 1, 5), x, 1:5, w) ≈ c0 @test addcounts!(fill(0.0, 1, 5), _reshape(x), 1:5, w) ≈ c0 # Perhaps this should not be allowed From 309586f0dd7d100254c5ce2da4ee26919c58a8a8 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Mon, 11 Oct 2021 12:43:43 -0500 Subject: [PATCH 10/43] Update src/counts.jl --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index 4b74b8597..fc2064858 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -330,7 +330,7 @@ const BaseRadixSortSafeTypes = Union{Int8, Int16, Int32, Int64, Int128, "Can the type be safely sorted by radixsort" radixsort_safe(::Type{T}) where T = T<:BaseRadixSortSafeTypes -function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T +function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractVector{T}) where T isempty(sx) && return cm last_sx = first(sx) start_i = firstindex(sx)::Integer From 3b5e01d104dd1a5ef81175972bd875353752ec45 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Mon, 11 Oct 2021 12:57:33 -0500 Subject: [PATCH 11/43] test :radixsort multidimensional array --- test/counts.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/counts.jl b/test/counts.jl index feac99ccf..7c26cda1f 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -116,6 +116,10 @@ end @test cm_any_itr isa Dict{Any,Int} # no knowledge about type @test cm_missing == cm + # with multidimensional array + cm_reshape = countmap(reshape(xx, 20, 100, 20, 10); alg=:radixsort) + @test cm_reshape == cm + # with empty array @test countmap(Int[]) == Dict{Int, Int}() From 3384f45fc9b064ccb278e283bcc2fee399ab99b7 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Mon, 11 Oct 2021 16:46:19 -0500 Subject: [PATCH 12/43] docstrings: homogonize, correct, explain proportionmap, shrink oneline descriptions --- src/counts.jl | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index fc2064858..f3e4da72c 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -91,8 +91,10 @@ counts(x::IntegerArray, wv::AbstractWeights) = counts(x, span(x), wv) proportions(x, levels=span(x), [wv::AbstractWeights]) Return the proportion of values in the range `levels` that occur in `x`. -Equivalent to `counts(x, levels) / length(x)`. If a weighting vector `wv` -is specified, the sum of the weights is used rather than the raw counts. +Equivalent to `counts(x, levels) / length(x)`. + +If a weighting vector `wv` is specified, the sum of the weights is used rather than the +raw counts. """ proportions(x::IntegerArray, levels::IntUnitRange) = counts(x, levels) .* inv(length(x)) proportions(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) = @@ -102,6 +104,9 @@ proportions(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) = proportions(x, k::Integer, [wv::AbstractWeights]) Return the proportion of integers in 1 to `k` that occur in `x`. + +If a weighting vector `wv` is specified, the sum of the weights is used rather than the +raw counts. """ proportions(x::IntegerArray, k::Integer) = proportions(x, 1:k) proportions(x::IntegerArray, k::Integer, wv::AbstractWeights) = proportions(x, 1:k, wv) @@ -230,13 +235,15 @@ end """ - addcounts!(dict, x[, wv]; alg = :auto) + addcounts!(dict, x; alg = :auto) + addcounts!(dict, x, wv) Add counts based on `x` to a count map. New entries will be added if new values come up. + If a weighting vector `wv` is specified, the sum of the weights is used rather than the raw counts. -`alg` can be one of: +`alg` is only allowed when `wv` is not specified and can be one of: - `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use `:radixsort`, otherwise use `:dict`. @@ -383,11 +390,14 @@ end """ countmap(x; alg = :auto) - countmap(x::AbstractVector, w::AbstractVector{<:Real}; alg = :auto) + countmap(x::AbstractVector, w::AbstractVector{<:Real}) + +Return a dictionary mapping each unique value in `x` to its number of occurrences. -Return a dictionary mapping each unique value in `x` to its number -of occurrences. A vector of weights `w` can be provided when `x` is a vector. +When `x` is a vector, a vector of weights `wv` can be provided and the sum of the weights +is used rather than the raw counts. +`alg` is only allowed for unweighted counting and can be one of: - `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use `:radixsort`, otherwise use `:dict`. @@ -406,10 +416,27 @@ countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcoun """ - proportionmap(x) + proportionmap(x; alg = :auto) + proportionmap(x::AbstractVector, w::AbstractVector{<:Real}) + +Return a dictionary mapping each unique value in `x` to its proportion in `x`. + +When `x` is a vector, a vector of weights `wv` can be provided and the sum of the weights +is used rather than the raw counts. + +`alg` is only allowed for unweighted counting and can be one of: +- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use + `:radixsort`, otherwise use `:dict`. -Return a dictionary mapping each unique value in `x` to its -proportion in `x`. +- `:radixsort`: if `radixsort_safe(eltype(x)) == true` then use the + [radix sort](https://en.wikipedia.org/wiki/Radix_sort) + algorithm to sort the input vector which will generally lead to + shorter running time. However the radix sort algorithm creates a + copy of the input vector and hence uses more RAM. Choose `:dict` + if the amount of available RAM is a limitation. + +- `:dict`: use `Dict`-based method which is generally slower but uses less + RAM and is safe for any data type. """ proportionmap(x::AbstractArray; alg = :auto) = _normalize_countmap(countmap(x; alg = alg), length(x)) proportionmap(x::AbstractArray, wv::AbstractWeights) = _normalize_countmap(countmap(x, wv), sum(wv)) From 6b81b514fa10688c7e7ce52e5481c02d12404f6b Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Mon, 11 Oct 2021 17:03:57 -0500 Subject: [PATCH 13/43] whitespace --- src/counts.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index f3e4da72c..dc32434d9 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -118,12 +118,10 @@ proportions(x::IntegerArray, wv::AbstractWeights) = proportions(x, span(x), wv) function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}) # add counts of integers from x to r - xlevels, ylevels = levels @boundscheck checkbounds(r, axes(xlevels, 1), axes(ylevels, 1)) - mx0 = first(xlevels) mx1 = last(xlevels) my0 = first(ylevels) @@ -376,7 +374,6 @@ function addcounts_radixsort!(cm::Dict{T}, x) where T end function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} - z = zero(W) for i in eachindex(x, wv) From 18b70b5aa586d8d6167cb496bc746670c92418e0 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Tue, 12 Oct 2021 16:23:35 -0500 Subject: [PATCH 14/43] test axes(::Weights) --- test/weights.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/weights.jl b/test/weights.jl index 7735e04f7..f833d878e 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -13,6 +13,7 @@ weight_funcs = (weights, aweights, fweights, pweights) @test isempty(f(Float64[])) @test size(f([1, 2, 3])) == (3,) + @test axes(f([1, 2, 3])) == (Base.OneTo(3),) w = [1., 2., 3.] wv = f(w) From 93caf72b331eab1d57cbe1cf9f350f55dab50c41 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Wed, 13 Oct 2021 23:59:28 -0500 Subject: [PATCH 15/43] fix tests --- Project.toml | 3 ++- src/weights.jl | 1 + test/counts.jl | 1 + test/weights.jl | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index b8bc3c96a..19c7a8423 100644 --- a/Project.toml +++ b/Project.toml @@ -26,10 +26,11 @@ StatsAPI = "1" julia = "1" [extras] +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Dates", "DelimitedFiles", "StableRNGs", "Test"] +test = ["OffsetArrays", "Dates", "DelimitedFiles", "StableRNGs", "Test"] diff --git a/src/weights.jl b/src/weights.jl index af509722e..6bf1038ec 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -277,6 +277,7 @@ sum(wv::UnitWeights{T}) where T = convert(T, length(wv)) isempty(wv::UnitWeights) = iszero(wv.len) length(wv::UnitWeights) = wv.len size(wv::UnitWeights) = tuple(length(wv)) +Base.axes(wv::UnitWeights) = tuple(Base.OneTo(length(wv))) Base.convert(::Type{Vector}, wv::UnitWeights{T}) where {T} = ones(T, length(wv)) diff --git a/test/counts.jl b/test/counts.jl index 7c26cda1f..91df8fecf 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -49,6 +49,7 @@ end @testset "2D integer counts" begin x = rand(1:4, n) y = rand(1:5, n) + w = weights(rand(n)) x0 = deepcopy(x) y0 = deepcopy(y) w0 = deepcopy(w) diff --git a/test/weights.jl b/test/weights.jl index f833d878e..86d8444de 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -106,6 +106,7 @@ end @test !isempty(wv) @test length(wv) === 3 @test size(wv) === (3,) + @test axes(wv) === (Base.OneTo(3),) @test sum(wv) === 3. @test wv == fill(1.0, 3) @test StatsBase.varcorrection(wv) == 1/3 From 62c5967bd8c92d631641e387f696f252c3de5c73 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Thu, 14 Oct 2021 16:18:42 -0500 Subject: [PATCH 16/43] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- Project.toml | 4 ++-- src/counts.jl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index 19c7a8423..62b5130d0 100644 --- a/Project.toml +++ b/Project.toml @@ -26,11 +26,11 @@ StatsAPI = "1" julia = "1" [extras] -OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["OffsetArrays", "Dates", "DelimitedFiles", "StableRNGs", "Test"] +test = ["Dates", "DelimitedFiles", "OffsetArrays", "StableRNGs", "Test"] diff --git a/src/counts.jl b/src/counts.jl index dc32434d9..917786c5d 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -241,7 +241,7 @@ Add counts based on `x` to a count map. New entries will be added if new values If a weighting vector `wv` is specified, the sum of the weights is used rather than the raw counts. -`alg` is only allowed when `wv` is not specified and can be one of: +`alg` is only allowed for unweighted counting and can be one of: - `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use `:radixsort`, otherwise use `:dict`. @@ -338,7 +338,7 @@ radixsort_safe(::Type{T}) where T = T<:BaseRadixSortSafeTypes function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractVector{T}) where T isempty(sx) && return cm last_sx = first(sx) - start_i = firstindex(sx)::Integer + start_i = firstindex(sx) # now the data is sorted: can just run through and accumulate values before # adding into the Dict From 1238a2d8cfbcaa293971e2a1a7e9a790cec177db Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Sat, 23 Oct 2021 08:32:48 -0500 Subject: [PATCH 17/43] put back dimension-mismatch (add messages) --- src/counts.jl | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 917786c5d..96e312ae5 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -44,6 +44,9 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv: x = vec(x) # discard shape because weights() discards shape + axes(x) == axes(wv) || throw(DimensionMismatch( + "vec(x) and wv must have the same axes, got $(axes(x)) and $(axes(wv))")) + @boundscheck checkbounds(r, axes(levels)...) m0 = first(levels) @@ -116,11 +119,15 @@ proportions(x::IntegerArray, wv::AbstractWeights) = proportions(x, span(x), wv) #### functions for counting a single list of integers (2D) function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}) - # add counts of integers from x to r + # add counts of pairs from zip(x,y) to r xlevels, ylevels = levels - @boundscheck checkbounds(r, axes(xlevels, 1), axes(ylevels, 1)) + axes(x) == axes(y) || throw(DimensionMismatch( + "x and y must have the same axes, got $(axes(x)) and $(axes(y))")) + + axes(r) == (axes(xlevels, 1), axes(ylevels, 1)) || throw(DimensionMismatch( + "axes(r) must correspond to the axes of levels, got $(axes(r)) ≠ $(axes(xlevels, 1), axes(ylevels, 1))")) mx0 = first(xlevels) mx1 = last(xlevels) @@ -142,13 +149,20 @@ end function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}, wv::AbstractWeights) - # add counts of integers from x to r + # add counts of pairs from zip(x,y) to r + + axes(x) == axes(y) || throw(DimensionMismatch( + "x and y must have the same axes, got $(axes(x)) and $(axes(y))")) x, y = vec(x), vec(y) # discard shape because weights() discards shape + axes(x) == axes(y) == axes(wv) || throw(DimensionMismatch( + "vec(x), vec(y), and wv must have the same axes, got $(axes(x)), $(axes(y)), and $(axes(wv))")) + xlevels, ylevels = levels - @boundscheck checkbounds(r, axes(xlevels, 1), axes(ylevels, 1)) + axes(r) == (axes(xlevels, 1), axes(ylevels, 1)) || throw(DimensionMismatch( + "axes(r) must correspond to the axes of levels, got $(axes(r)) ≠ $(axes(xlevels, 1), axes(ylevels, 1))")) mx0 = first(xlevels) mx1 = last(xlevels) @@ -374,6 +388,12 @@ function addcounts_radixsort!(cm::Dict{T}, x) where T end function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} + + x = vec(x) # discard shape because weights() discards shape + + axes(x) == axes(wv) || throw(DimensionMismatch( + "vec(x) and wv must have the same axes, got $(axes(x)) and $(axes(wv))")) + z = zero(W) for i in eachindex(x, wv) From c1c09d179babe5ee54c6d91d0b672aeeb862fab8 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sat, 23 Oct 2021 08:55:36 -0500 Subject: [PATCH 18/43] test formatting --- test/counts.jl | 83 +++++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/test/counts.jl b/test/counts.jl index 91df8fecf..9f09a91d9 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -3,7 +3,6 @@ using Test using OffsetArrays n = 5000 -_reshape(x) = reshape(x, 10, 50, 10) @testset "1D integer counts" begin x = rand(1:5, n) @@ -12,34 +11,34 @@ _reshape(x) = reshape(x, 10, 50, 10) w0 = deepcopy(w) c0 = Int[count(v->v == i, x) for i in 1:5] - @test counts(x, 5) == c0 - @test counts(x .+ 1, 2:6) == c0 - @test proportions(x, 1:5) ≈ (c0 ./ n) - @test counts(_reshape(x), 5) == c0 + @test counts(x, 5) == c0 + @test counts(x .+ 1, 2:6) == c0 + @test proportions(x, 1:5) ≈ (c0 ./ n) + @test counts(reshape(x, 10, 50, 10), 5) == c0 - @test counts(x) == c0 - @test proportions(x) ≈ (c0 ./ n) - @test counts(_reshape(x)) == c0 + @test counts(x) == c0 + @test proportions(x) ≈ (c0 ./ n) + @test counts(reshape(x, 10, 50, 10)) == c0 c0 = reshape(c0, 1, 5) - @test addcounts!(fill(0, 1, 5), x, 1:5) == c0 - @test addcounts!(fill(0, 1, 5), _reshape(x), 1:5) == c0 + @test addcounts!(fill(0, 1, 5), x, 1:5) == c0 + @test addcounts!(fill(0, 1, 5), reshape(x, 10, 50, 10), 1:5) == c0 c0 = Float64[sum(w.values[x .== i]) for i in 1:5] - @test counts(x, 5, w) ≈ c0 - @test counts(x .+ 1, 2:6, w) ≈ c0 - @test proportions(x, 1:5, w) ≈ (c0 ./ sum(w)) - @test counts(_reshape(x), 5, w) ≈ c0 # Perhaps this should not be allowed + @test counts(x, 5, w) ≈ c0 + @test counts(x .+ 1, 2:6, w) ≈ c0 + @test proportions(x, 1:5, w) ≈ (c0 ./ sum(w)) + @test counts(reshape(x, 10, 50, 10), 5, w) ≈ c0 # Perhaps this should not be allowed - @test counts(x, w) ≈ c0 - @test counts(x .+ 1, 2:6, w) ≈ c0 - @test proportions(x, w) ≈ (c0 ./ sum(w)) - @test counts(_reshape(x), w) ≈ c0 # Perhaps this should not be allowed + @test counts(x, w) ≈ c0 + @test counts(x .+ 1, 2:6, w) ≈ c0 + @test proportions(x, w) ≈ (c0 ./ sum(w)) + @test counts(reshape(x, 10, 50, 10), w) ≈ c0 # Perhaps this should not be allowed #addcounts! to row matrix c0 = reshape(c0, 1, 5) - @test addcounts!(fill(0.0, 1, 5), x, 1:5, w) ≈ c0 - @test addcounts!(fill(0.0, 1, 5), _reshape(x), 1:5, w) ≈ c0 # Perhaps this should not be allowed + @test addcounts!(fill(0.0, 1, 5), x, 1:5, w) ≈ c0 + @test addcounts!(fill(0.0, 1, 5), reshape(x, 10, 50, 10), 1:5, w) ≈ c0 # Perhaps this should not be allowed @test x == x0 @test w == w0 @@ -55,26 +54,26 @@ end w0 = deepcopy(w) c0 = Int[count(t->t != 0, (x .== i) .& (y .== j)) for i in 1:4, j in 1:5] - @test counts(x, y, (4, 5)) == c0 - @test counts(x .+ 2, y .+ 3, (3:6, 4:8)) == c0 - @test proportions(x, y, (1:4, 1:5)) ≈ (c0 ./ n) - @test counts(_reshape(x), _reshape(y), (4, 5)) == c0 + @test counts(x, y, (4, 5)) == c0 + @test counts(x .+ 2, y .+ 3, (3:6, 4:8)) == c0 + @test proportions(x, y, (1:4, 1:5)) ≈ (c0 ./ n) + @test counts(reshape(x, 10, 50, 10), reshape(y, 10, 50, 10), (4, 5)) == c0 - @test counts(x, y) == c0 - @test counts(x .+ 2, y .+ 3, (3:6, 4:8)) == c0 - @test proportions(x, y,) ≈ (c0 ./ n) - @test counts(_reshape(x), _reshape(y)) == c0 + @test counts(x, y) == c0 + @test counts(x .+ 2, y .+ 3, (3:6, 4:8)) == c0 + @test proportions(x, y,) ≈ (c0 ./ n) + @test counts(reshape(x, 10, 50, 10), reshape(y, 10, 50, 10)) == c0 c0 = Float64[sum(w.values[(x .== i) .& (y .== j)]) for i in 1:4, j in 1:5] - @test counts(x, y, (4, 5), w) ≈ c0 - @test counts(x .+ 2, y .+ 3, (3:6, 4:8), w) ≈ c0 - @test proportions(x, y, (1:4, 1:5), w) ≈ (c0 ./ sum(w)) - @test counts(_reshape(x), _reshape(y), (4, 5), w) ≈ c0 # Perhaps this should not be allowed + @test counts(x, y, (4, 5), w) ≈ c0 + @test counts(x .+ 2, y .+ 3, (3:6, 4:8), w) ≈ c0 + @test proportions(x, y, (1:4, 1:5), w) ≈ (c0 ./ sum(w)) + @test counts(reshape(x, 10, 50, 10), reshape(y, 10, 50, 10), (4, 5), w) ≈ c0 # Perhaps this should not be allowed - @test counts(x, y, w) ≈ c0 - @test counts(x .+ 2, y .+ 3, (3:6, 4:8), w) ≈ c0 - @test proportions(x, y, w) ≈ (c0 ./ sum(w)) - @test counts(_reshape(x), _reshape(y), w) ≈ c0 # Perhaps this should not be allowed + @test counts(x, y, w) ≈ c0 + @test counts(x .+ 2, y .+ 3, (3:6, 4:8), w) ≈ c0 + @test proportions(x, y, w) ≈ (c0 ./ sum(w)) + @test counts(reshape(x, 10, 50, 10), reshape(y, 10, 50, 10), w) ≈ c0 # Perhaps this should not be allowed @test x == x0 @test y == y0 @@ -189,13 +188,13 @@ end zw = weights(OffsetArray(w, -2n)) # proportions calls counts which calls addcounts! - @test proportions(x) == proportions(y) == proportions(z) - @test proportions(x, xw) == proportions(y, yw) == proportions(z, zw) - @test proportionmap(x) == proportionmap(y) == proportionmap(z) + @test proportions(x) == proportions(y) == proportions(z) + @test proportions(x, xw) == proportions(y, yw) == proportions(z, zw) + @test proportionmap(x) == proportionmap(y) == proportionmap(z) @test proportionmap(x, xw) == proportionmap(y, yw) == proportionmap(z, zw) - @test (countmap(x) == countmap(x; alg = :dict) == countmap(x; alg = :radixsort) - == countmap(y) == countmap(y; alg = :dict) == countmap(y; alg = :radixsort) - == countmap(z) == countmap(z; alg = :dict) == countmap(z; alg = :radixsort)) + @test (countmap(x) == countmap(x; alg = :dict) == countmap(x; alg = :radixsort) == + countmap(y) == countmap(y; alg = :dict) == countmap(y; alg = :radixsort) == + countmap(z) == countmap(z; alg = :dict) == countmap(z; alg = :radixsort)) @test proportionmap(x, xw) == proportionmap(y, yw) == proportionmap(z, zw) # countmap and proportionmap only support the :dict algorithm for weighted sums. end From bc20432ddffc28c2064ce59e00afd29002920956 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sat, 23 Oct 2021 08:58:42 -0500 Subject: [PATCH 19/43] better error messages --- src/counts.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 96e312ae5..cc32a953a 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -44,8 +44,8 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv: x = vec(x) # discard shape because weights() discards shape - axes(x) == axes(wv) || throw(DimensionMismatch( - "vec(x) and wv must have the same axes, got $(axes(x)) and $(axes(wv))")) + lenth(x) == lenth(wv) || throw(DimensionMismatch( + "x and wv must have the same length, got $(length(x)) and $(length(wv))")) @boundscheck checkbounds(r, axes(levels)...) @@ -156,8 +156,8 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, x, y = vec(x), vec(y) # discard shape because weights() discards shape - axes(x) == axes(y) == axes(wv) || throw(DimensionMismatch( - "vec(x), vec(y), and wv must have the same axes, got $(axes(x)), $(axes(y)), and $(axes(wv))")) + length(x) == length(y) == length(wv) || throw(DimensionMismatch( + "x, y, and wv must have the same length, got $(length(x)), $(length(y)), and $(length(wv))")) xlevels, ylevels = levels @@ -391,8 +391,8 @@ function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) whe x = vec(x) # discard shape because weights() discards shape - axes(x) == axes(wv) || throw(DimensionMismatch( - "vec(x) and wv must have the same axes, got $(axes(x)) and $(axes(wv))")) + length(x) == length(wv) || throw(DimensionMismatch( + "x and wv must have the same length, got $(length(x)) and $(length(wv))")) z = zero(W) From cbfd92e73a3ea33f305fa0d93548e14e69c84540 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sat, 23 Oct 2021 09:03:42 -0500 Subject: [PATCH 20/43] test :dict on reshape as well --- test/counts.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/counts.jl b/test/counts.jl index 9f09a91d9..fdeed7986 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -117,8 +117,8 @@ end @test cm_missing == cm # with multidimensional array - cm_reshape = countmap(reshape(xx, 20, 100, 20, 10); alg=:radixsort) - @test cm_reshape == cm + @test countmap(reshape(xx, 20, 100, 20, 10); alg=:radixsort) == cm + @test countmap(reshape(xx, 20, 100, 20, 10); alg=:dict) == cm # with empty array @test countmap(Int[]) == Dict{Int, Int}() From 60c0aad7fd350ef2fb9c8a8f6f5eff070a6e0b91 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sat, 23 Oct 2021 09:18:01 -0500 Subject: [PATCH 21/43] typos (fix tests) --- src/counts.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index cc32a953a..6c8753962 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -44,7 +44,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv: x = vec(x) # discard shape because weights() discards shape - lenth(x) == lenth(wv) || throw(DimensionMismatch( + length(x) == length(wv) || throw(DimensionMismatch( "x and wv must have the same length, got $(length(x)) and $(length(wv))")) @boundscheck checkbounds(r, axes(levels)...) @@ -127,7 +127,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels:: "x and y must have the same axes, got $(axes(x)) and $(axes(y))")) axes(r) == (axes(xlevels, 1), axes(ylevels, 1)) || throw(DimensionMismatch( - "axes(r) must correspond to the axes of levels, got $(axes(r)) ≠ $(axes(xlevels, 1), axes(ylevels, 1))")) + "axes(r) must correspond to the axes of levels, got $(axes(r)) ≠ $(axes(xlevels, 1)), $(axes(ylevels, 1))")) mx0 = first(xlevels) mx1 = last(xlevels) @@ -162,7 +162,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, xlevels, ylevels = levels axes(r) == (axes(xlevels, 1), axes(ylevels, 1)) || throw(DimensionMismatch( - "axes(r) must correspond to the axes of levels, got $(axes(r)) ≠ $(axes(xlevels, 1), axes(ylevels, 1))")) + "axes(r) must correspond to the axes of levels, got $(axes(r)) ≠ $(axes(xlevels, 1)), $(axes(ylevels, 1))")) mx0 = first(xlevels) mx1 = last(xlevels) From c72b1aafefefb4f1a61e606e03d52f89c3e0bd7f Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sat, 23 Oct 2021 09:18:35 -0500 Subject: [PATCH 22/43] fixed typo in tests that stopped a test from running --- test/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/counts.jl b/test/counts.jl index fdeed7986..b29ee41c1 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -114,7 +114,7 @@ end cm_any_itr = countmap((i for i in xx)) @test cm_any_itr isa Dict{Any,Int} # no knowledge about type - @test cm_missing == cm + @test cm_any_itr == cm # with multidimensional array @test countmap(reshape(xx, 20, 100, 20, 10); alg=:radixsort) == cm From e4ca2648a13a6040adf51b2fcd7bfc848cbfa20c Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sat, 23 Oct 2021 16:53:24 -0500 Subject: [PATCH 23/43] conform to nalimilan's style --- src/counts.jl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 6c8753962..582b04cda 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -123,11 +123,11 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels:: xlevels, ylevels = levels - axes(x) == axes(y) || throw(DimensionMismatch( - "x and y must have the same axes, got $(axes(x)) and $(axes(y))")) + axes(x) == axes(y) || + throw(DimensionMismatch("x and y must have the same axes, but got $(axes(x)) and $(axes(y))")) - axes(r) == (axes(xlevels, 1), axes(ylevels, 1)) || throw(DimensionMismatch( - "axes(r) must correspond to the axes of levels, got $(axes(r)) ≠ $(axes(xlevels, 1)), $(axes(ylevels, 1))")) + axes(r) == (axes(xlevels, 1), axes(ylevels, 1)) || + throw(DimensionMismatch("axes(r) must correspond to the axes of levels, but got $(axes(r)) ≠ $(axes(xlevels, 1)), $(axes(ylevels, 1))")) mx0 = first(xlevels) mx1 = last(xlevels) @@ -151,13 +151,13 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}, wv::AbstractWeights) # add counts of pairs from zip(x,y) to r - axes(x) == axes(y) || throw(DimensionMismatch( - "x and y must have the same axes, got $(axes(x)) and $(axes(y))")) + axes(x) == axes(y) || + throw(DimensionMismatch("x and y must have the same axes, but got $(axes(x)) and $(axes(y))")) x, y = vec(x), vec(y) # discard shape because weights() discards shape - length(x) == length(y) == length(wv) || throw(DimensionMismatch( - "x, y, and wv must have the same length, got $(length(x)), $(length(y)), and $(length(wv))")) + length(x) == length(y) == length(wv) || + throw(DimensionMismatch("x, y, and wv must have the same length, but got $(length(x)), $(length(y)), and $(length(wv))")) xlevels, ylevels = levels @@ -391,8 +391,8 @@ function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) whe x = vec(x) # discard shape because weights() discards shape - length(x) == length(wv) || throw(DimensionMismatch( - "x and wv must have the same length, got $(length(x)) and $(length(wv))")) + length(x) == length(wv) || + throw(DimensionMismatch("x and wv must have the same length, got $(length(x)) and $(length(wv))")) z = zero(W) From 00fe4083fc13016f0b76a101d8ea3cf9f0865b05 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sat, 23 Oct 2021 16:57:42 -0500 Subject: [PATCH 24/43] whitespace Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index 582b04cda..398f37e5d 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -388,7 +388,6 @@ function addcounts_radixsort!(cm::Dict{T}, x) where T end function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} - x = vec(x) # discard shape because weights() discards shape length(x) == length(wv) || From a4705319822efc25e9dc1103630c9497c41540e0 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sat, 23 Oct 2021 17:24:52 -0500 Subject: [PATCH 25/43] rename x -> xv to avoid type instability --- src/counts.jl | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 398f37e5d..0e944a68d 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -42,19 +42,19 @@ end function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) # add wv weighted counts of integers from x that fall within levels to r - x = vec(x) # discard shape because weights() discards shape - length(x) == length(wv) || throw(DimensionMismatch( "x and wv must have the same length, got $(length(x)) and $(length(wv))")) + xv = vec(x) # discard shape because weights() discards shape + @boundscheck checkbounds(r, axes(levels)...) m0 = first(levels) m1 = last(levels) b = m0 - 1 - @inbounds for i in eachindex(x, wv) - xi = x[i] + @inbounds for i in eachindex(xv, wv) + xi = xv[i] if m0 <= xi <= m1 r[xi - b] += wv[i] end @@ -151,13 +151,13 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}, wv::AbstractWeights) # add counts of pairs from zip(x,y) to r + length(x) == length(y) == length(wv) || + throw(DimensionMismatch("x, y, and wv must have the same length, but got $(length(x)), $(length(y)), and $(length(wv))")) + axes(x) == axes(y) || throw(DimensionMismatch("x and y must have the same axes, but got $(axes(x)) and $(axes(y))")) - x, y = vec(x), vec(y) # discard shape because weights() discards shape - - length(x) == length(y) == length(wv) || - throw(DimensionMismatch("x, y, and wv must have the same length, but got $(length(x)), $(length(y)), and $(length(wv))")) + xv, yv = vec(x), vec(y) # discard shape because weights() discards shape xlevels, ylevels = levels @@ -172,9 +172,9 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, bx = mx0 - 1 by = my0 - 1 - for i in eachindex(x, y, wv) - xi = x[i] - yi = y[i] + for i in eachindex(xv, yv, wv) + xi = xv[i] + yi = yv[i] if (mx0 <= xi <= mx1) && (my0 <= yi <= my1) r[xi - bx, yi - by] += wv[i] end @@ -388,15 +388,17 @@ function addcounts_radixsort!(cm::Dict{T}, x) where T end function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} - x = vec(x) # discard shape because weights() discards shape + # add wv weighted counts of integers from x to cm length(x) == length(wv) || throw(DimensionMismatch("x and wv must have the same length, got $(length(x)) and $(length(wv))")) + xv = vec(x) # discard shape because weights() discards shape + z = zero(W) - for i in eachindex(x, wv) - @inbounds xi = x[i] + for i in eachindex(xv, wv) + @inbounds xi = xv[i] @inbounds wi = wv[i] cm[xi] = get(cm, xi, z) + wi end From aca38a31a80561e57cc0fd10659fa6bca1bbfa99 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sat, 23 Oct 2021 17:47:30 -0500 Subject: [PATCH 26/43] test multidimensional countmap input with vector weights --- test/counts.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/counts.jl b/test/counts.jl index b29ee41c1..46d4d60f7 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -152,6 +152,7 @@ end @test cm["c"] == 3.5 @test cm == countmap(x, w) + @test cm == countmap(reshape(x, 2, 3), w) pm = proportionmap(x, weights(w)) @test pm["a"] ≈ (5.5 / 13.5) From 168b3ab66b062c64e9081937ef149b75c6665f10 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sat, 23 Oct 2021 18:01:07 -0500 Subject: [PATCH 27/43] docstring remove 'when x is a vector' --- src/counts.jl | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 0e944a68d..5affafb3a 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -19,8 +19,10 @@ end addcounts!(r, x, levels::UnitRange{<:Int}, [wv::AbstractWeights]) Add the number of occurrences in `x` of each value in `levels` to an existing -array `r`. If a weighting vector `wv` is specified, the sum of weights is used -rather than the raw counts. +array `r`. + +If a weighting vector `wv` is specified, the sum of weights is used rather than the +raw counts. """ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange) # add counts of integers from x that fall within levels to r @@ -73,7 +75,7 @@ falling in that range will be considered (the others will be ignored without raising an error or a warning). If an integer `k` is provided, only values in the range `1:k` will be considered. -If a weighting vector `wv` is specified, the sum of the weights is used rather than the +If a weighting vector `wv` is specified, the proportion of weights is used rather than the raw counts. The output is a vector of length `length(levels)`. @@ -412,7 +414,7 @@ end Return a dictionary mapping each unique value in `x` to its number of occurrences. -When `x` is a vector, a vector of weights `wv` can be provided and the sum of the weights +A vector of weights `wv` can be provided and the sum of the weights is used rather than the raw counts. `alg` is only allowed for unweighted counting and can be one of: @@ -439,8 +441,8 @@ countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcoun Return a dictionary mapping each unique value in `x` to its proportion in `x`. -When `x` is a vector, a vector of weights `wv` can be provided and the sum of the weights -is used rather than the raw counts. +If a vector of weights `wv` is provided, the weighted proportion is computed rather +than the proportion of raw counts. `alg` is only allowed for unweighted counting and can be one of: - `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use From 204f89af54d710439d1a07cd1b954ff7454127e0 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sat, 23 Oct 2021 18:07:21 -0500 Subject: [PATCH 28/43] sum of weights to proportion of weights in docstrings where applicable --- src/counts.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 5affafb3a..627ccd502 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -98,7 +98,7 @@ counts(x::IntegerArray, wv::AbstractWeights) = counts(x, span(x), wv) Return the proportion of values in the range `levels` that occur in `x`. Equivalent to `counts(x, levels) / length(x)`. -If a weighting vector `wv` is specified, the sum of the weights is used rather than the +If a weighting vector `wv` is specified, the proportion of weight is used rather than the raw counts. """ proportions(x::IntegerArray, levels::IntUnitRange) = counts(x, levels) .* inv(length(x)) @@ -110,7 +110,7 @@ proportions(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) = Return the proportion of integers in 1 to `k` that occur in `x`. -If a weighting vector `wv` is specified, the sum of the weights is used rather than the +If a weighting vector `wv` is specified, the proportion of weight is used rather than the raw counts. """ proportions(x::IntegerArray, k::Integer) = proportions(x, 1:k) @@ -414,8 +414,8 @@ end Return a dictionary mapping each unique value in `x` to its number of occurrences. -A vector of weights `wv` can be provided and the sum of the weights -is used rather than the raw counts. +A vector of weights `wv` can be provided and the sum of the weights is used rather than the +raw counts. `alg` is only allowed for unweighted counting and can be one of: - `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use From 3f3e6573f23cfffbda7928dedbfa13373b9eb18f Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sun, 24 Oct 2021 15:00:41 -0500 Subject: [PATCH 29/43] use nalimilan's word choice Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 627ccd502..c37c1af5a 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -75,8 +75,8 @@ falling in that range will be considered (the others will be ignored without raising an error or a warning). If an integer `k` is provided, only values in the range `1:k` will be considered. -If a weighting vector `wv` is specified, the proportion of weights is used rather than the -raw counts. +If a vector of weights `wv` is provided, the proportion of weights is computed rather +than the proportion of raw counts. The output is a vector of length `length(levels)`. """ @@ -98,8 +98,8 @@ counts(x::IntegerArray, wv::AbstractWeights) = counts(x, span(x), wv) Return the proportion of values in the range `levels` that occur in `x`. Equivalent to `counts(x, levels) / length(x)`. -If a weighting vector `wv` is specified, the proportion of weight is used rather than the -raw counts. +If a vector of weights `wv` is provided, the proportion of weights is computed rather +than the proportion of raw counts. """ proportions(x::IntegerArray, levels::IntUnitRange) = counts(x, levels) .* inv(length(x)) proportions(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) = @@ -110,8 +110,8 @@ proportions(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) = Return the proportion of integers in 1 to `k` that occur in `x`. -If a weighting vector `wv` is specified, the proportion of weight is used rather than the -raw counts. +If a vector of weights `wv` is provided, the proportion of weights is computed rather +than the proportion of raw counts. """ proportions(x::IntegerArray, k::Integer) = proportions(x, 1:k) proportions(x::IntegerArray, k::Integer, wv::AbstractWeights) = proportions(x, 1:k, wv) @@ -414,7 +414,7 @@ end Return a dictionary mapping each unique value in `x` to its number of occurrences. -A vector of weights `wv` can be provided and the sum of the weights is used rather than the +If a weighting vector `wv` is specified, the sum of weights is used rather than the raw counts. `alg` is only allowed for unweighted counting and can be one of: @@ -441,7 +441,7 @@ countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcoun Return a dictionary mapping each unique value in `x` to its proportion in `x`. -If a vector of weights `wv` is provided, the weighted proportion is computed rather +If a vector of weights `wv` is provided, the proportion of weights is computed rather than the proportion of raw counts. `alg` is only allowed for unweighted counting and can be one of: From 657c1ecf853353d176d8fb5d4ea95c3c72c5a4d0 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sun, 24 Oct 2021 15:06:29 -0500 Subject: [PATCH 30/43] docstring typo Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index c37c1af5a..fb131e584 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -410,7 +410,7 @@ end """ countmap(x; alg = :auto) - countmap(x::AbstractVector, w::AbstractVector{<:Real}) + countmap(x::AbstractVector, wv::AbstractVector{<:Real}) Return a dictionary mapping each unique value in `x` to its number of occurrences. From 2ebf1f3b4381b8f696cceb41eb47d4659e9bd62f Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sun, 24 Oct 2021 15:20:22 -0500 Subject: [PATCH 31/43] nalimilan's style preference --- test/counts.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/counts.jl b/test/counts.jl index 46d4d60f7..86d4cbbd5 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -193,9 +193,9 @@ end @test proportions(x, xw) == proportions(y, yw) == proportions(z, zw) @test proportionmap(x) == proportionmap(y) == proportionmap(z) @test proportionmap(x, xw) == proportionmap(y, yw) == proportionmap(z, zw) - @test (countmap(x) == countmap(x; alg = :dict) == countmap(x; alg = :radixsort) == - countmap(y) == countmap(y; alg = :dict) == countmap(y; alg = :radixsort) == - countmap(z) == countmap(z; alg = :dict) == countmap(z; alg = :radixsort)) + @test countmap(x) == countmap(x; alg = :dict) == countmap(x; alg = :radixsort) == + countmap(y) == countmap(y; alg = :dict) == countmap(y; alg = :radixsort) == + countmap(z) == countmap(z; alg = :dict) == countmap(z; alg = :radixsort) @test proportionmap(x, xw) == proportionmap(y, yw) == proportionmap(z, zw) # countmap and proportionmap only support the :dict algorithm for weighted sums. end From f469f0cebd6d97c29a93bcdbd8345f43912c5015 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Tue, 26 Oct 2021 07:26:42 -0500 Subject: [PATCH 32/43] vectorize before sorting in iterator case; remove @boundcheck annotations --- src/counts.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index fb131e584..347074a3c 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -27,7 +27,7 @@ raw counts. function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange) # add counts of integers from x that fall within levels to r - @boundscheck checkbounds(r, axes(levels)...) + checkbounds(r, axes(levels)...) m0 = first(levels) m1 = last(levels) @@ -49,7 +49,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv: xv = vec(x) # discard shape because weights() discards shape - @boundscheck checkbounds(r, axes(levels)...) + checkbounds(r, axes(levels)...) m0 = first(levels) m1 = last(levels) @@ -385,7 +385,7 @@ end # fall-back for `x` an iterator function addcounts_radixsort!(cm::Dict{T}, x) where T - sx = sort!(collect(x), alg = RadixSort) + sx = sort!(vec(collect(x)), alg = RadixSort) return _addcounts_radix_sort_loop!(cm, sx) end From 6cf64d93c08fc6d721ed353bdbef56801f9b80b4 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Tue, 26 Oct 2021 07:27:15 -0500 Subject: [PATCH 33/43] skip OffsetArray tests before 1.6 --- test/counts.jl | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/test/counts.jl b/test/counts.jl index 86d4cbbd5..6a5ef19f9 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -179,23 +179,25 @@ end @test countmap(X) == countmap(copy(X)) end -@testset "offset arrays" begin - x = rand(1:5, n) - w = rand(n) - xw = weights(w) - y = OffsetArray(x, n÷2) - yw = weights(OffsetArray(w, n÷2)) - z = OffsetArray(x, -2n) - zw = weights(OffsetArray(w, -2n)) - - # proportions calls counts which calls addcounts! - @test proportions(x) == proportions(y) == proportions(z) - @test proportions(x, xw) == proportions(y, yw) == proportions(z, zw) - @test proportionmap(x) == proportionmap(y) == proportionmap(z) - @test proportionmap(x, xw) == proportionmap(y, yw) == proportionmap(z, zw) - @test countmap(x) == countmap(x; alg = :dict) == countmap(x; alg = :radixsort) == - countmap(y) == countmap(y; alg = :dict) == countmap(y; alg = :radixsort) == - countmap(z) == countmap(z; alg = :dict) == countmap(z; alg = :radixsort) - @test proportionmap(x, xw) == proportionmap(y, yw) == proportionmap(z, zw) - # countmap and proportionmap only support the :dict algorithm for weighted sums. +if VERSION >= v"1.6" + @testset "offset arrays" begin + x = rand(1:5, n) + w = rand(n) + xw = weights(w) + y = OffsetArray(x, n÷2) + yw = weights(OffsetArray(w, n÷2)) + z = OffsetArray(x, -2n) + zw = weights(OffsetArray(w, -2n)) + + # proportions calls counts which calls addcounts! + @test proportions(x) == proportions(y) == proportions(z) + @test proportions(x, xw) == proportions(y, yw) == proportions(z, zw) + @test proportionmap(x) == proportionmap(y) == proportionmap(z) + @test proportionmap(x, xw) == proportionmap(y, yw) == proportionmap(z, zw) + @test countmap(x) == countmap(x; alg = :dict) == countmap(x; alg = :radixsort) == + countmap(y) == countmap(y; alg = :dict) == countmap(y; alg = :radixsort) == + countmap(z) == countmap(z; alg = :dict) == countmap(z; alg = :radixsort) + @test proportionmap(x, xw) == proportionmap(y, yw) == proportionmap(z, zw) + # countmap and proportionmap only support the :dict algorithm for weighted sums. + end end From dc1f7ae7d0ac65ad83e006817ec7e1cb4a037fb1 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Tue, 17 May 2022 06:36:39 -0500 Subject: [PATCH 34/43] Update src/counts.jl Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 347074a3c..b28b35fbc 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -44,8 +44,8 @@ end function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) # add wv weighted counts of integers from x that fall within levels to r - length(x) == length(wv) || throw(DimensionMismatch( - "x and wv must have the same length, got $(length(x)) and $(length(wv))")) + length(x) == length(wv) || + throw(DimensionMismatch("x and wv must have the same length, got $(length(x)) and $(length(wv))")) xv = vec(x) # discard shape because weights() discards shape From 4fd45d913d88af201eb1de0e983560f35f3f0e14 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Tue, 17 May 2022 06:48:21 -0500 Subject: [PATCH 35/43] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index b28b35fbc..617471340 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -125,11 +125,8 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels:: xlevels, ylevels = levels - axes(x) == axes(y) || - throw(DimensionMismatch("x and y must have the same axes, but got $(axes(x)) and $(axes(y))")) - axes(r) == (axes(xlevels, 1), axes(ylevels, 1)) || - throw(DimensionMismatch("axes(r) must correspond to the axes of levels, but got $(axes(r)) ≠ $(axes(xlevels, 1)), $(axes(ylevels, 1))")) + checkbounds(r, axes(xlevels, 1), axes(ylevels, 1)) mx0 = first(xlevels) mx1 = last(xlevels) @@ -139,7 +136,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels:: bx = mx0 - 1 by = my0 - 1 - for i in eachindex(x, y) + for i in eachindex(vec(x), vec(y)) xi = x[i] yi = y[i] if (mx0 <= xi <= mx1) && (my0 <= yi <= my1) @@ -163,8 +160,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, xlevels, ylevels = levels - axes(r) == (axes(xlevels, 1), axes(ylevels, 1)) || throw(DimensionMismatch( - "axes(r) must correspond to the axes of levels, got $(axes(r)) ≠ $(axes(xlevels, 1)), $(axes(ylevels, 1))")) + checkbounds(r, axes(xlevels, 1), axes(ylevels, 1)) mx0 = first(xlevels) mx1 = last(xlevels) From 0fd1c27fa4eaec8229b83d93bda9291e94ae83d5 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Tue, 17 May 2022 15:12:39 -0500 Subject: [PATCH 36/43] Update src/counts.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bogumił Kamiński --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index 617471340..d1cb3655a 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -16,7 +16,7 @@ end #### functions for counting a single list of integers (1D) """ - addcounts!(r, x, levels::UnitRange{<:Int}, [wv::AbstractWeights]) + addcounts!(r, x, levels::UnitRange{<:Integer}, [wv::AbstractWeights]) Add the number of occurrences in `x` of each value in `levels` to an existing array `r`. From fbdf5642b61ef2d6ddd744266a4b106a61ce6196 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Thu, 19 May 2022 12:54:19 -0500 Subject: [PATCH 37/43] Update src/counts.jl --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index d1cb3655a..f8eab8b85 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -19,7 +19,7 @@ end addcounts!(r, x, levels::UnitRange{<:Integer}, [wv::AbstractWeights]) Add the number of occurrences in `x` of each value in `levels` to an existing -array `r`. +array `r`. For each `xi ∈ x`, if `xi == levels[j]`, then we increment `r[j]`. If a weighting vector `wv` is specified, the sum of weights is used rather than the raw counts. From e1fab9900cb0737cd76a4c4c80d03dd63c02b61b Mon Sep 17 00:00:00 2001 From: Lilith Hafner Date: Thu, 2 Jun 2022 16:44:09 -0400 Subject: [PATCH 38/43] use firstindex instead of 1 for robustness to future type signature expansion --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index f8eab8b85..7f4389873 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -31,7 +31,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange) m0 = first(levels) m1 = last(levels) - b = m0 - 1 + b = m0 - firstindex(levels) # firstindex(levels) == 1 because levels::IntUnitRange @inbounds for xi in x if m0 <= xi <= m1 From 717c795b8329251344511241230b1e1cfe350579 Mon Sep 17 00:00:00 2001 From: Lilith Hafner Date: Mon, 6 Jun 2022 11:23:01 -0400 Subject: [PATCH 39/43] Switch from SortingAlgorithms to Base's radix sort in Julia 1.9+ (closes #796) --- src/counts.jl | 14 ++++++++++++-- test/counts.jl | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 7f4389873..c398a5b7d 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -369,9 +369,18 @@ function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractVector{T}) where T return cm end +function _alg(x::AbstractArray) + @static if VERSION >= v"1.9.0-DEV" + Base.DEFAULT_UNSTABLE + else + firstindex(x) == 1 || error("addcounts_radixsort! requires either one based indexing or Julia 1.9. Use `alg = :dict` as an alternative.") + SortingAlgorithms.RadixSort + end +end + function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T # sort the x using radixsort - sx = sort(vec(x), alg = RadixSort) + sx = sort(vec(x), alg=_alg(x)) # Delegate the loop to a separate function since sort might not # be inferred in Julia 0.6 after SortingAlgorithms is loaded. @@ -381,7 +390,8 @@ end # fall-back for `x` an iterator function addcounts_radixsort!(cm::Dict{T}, x) where T - sx = sort!(vec(collect(x)), alg = RadixSort) + cx = vec(collect(x)) + sx = sort!(cx, alg = _alg(cx)) return _addcounts_radix_sort_loop!(cm, sx) end diff --git a/test/counts.jl b/test/counts.jl index 6a5ef19f9..f5d6ae69f 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -179,7 +179,7 @@ end @test countmap(X) == countmap(copy(X)) end -if VERSION >= v"1.6" +if VERSION >= v"1.9.0-DEV" @testset "offset arrays" begin x = rand(1:5, n) w = rand(n) From ba59cc3f32b4f1ba0442e90db33695a3cb7534ee Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Mon, 6 Jun 2022 17:04:12 -0400 Subject: [PATCH 40/43] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index c398a5b7d..f217a1dcd 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -373,7 +373,9 @@ function _alg(x::AbstractArray) @static if VERSION >= v"1.9.0-DEV" Base.DEFAULT_UNSTABLE else - firstindex(x) == 1 || error("addcounts_radixsort! requires either one based indexing or Julia 1.9. Use `alg = :dict` as an alternative.") + firstindex(x) == 1 || + throw(ArgumentError("alg=:radixsort requires either one based indexing or Julia >= 1.9. " + "Use `alg = :dict` as an alternative.")) SortingAlgorithms.RadixSort end end @@ -464,5 +466,5 @@ than the proportion of raw counts. - `:dict`: use `Dict`-based method which is generally slower but uses less RAM and is safe for any data type. """ -proportionmap(x::AbstractArray; alg = :auto) = _normalize_countmap(countmap(x; alg = alg), length(x)) +proportionmap(x::AbstractArray) = _normalize_countmap(countmap(x), length(x)) proportionmap(x::AbstractArray, wv::AbstractWeights) = _normalize_countmap(countmap(x, wv), sum(wv)) From 2dfba9f684eb37692968d18ca1d0aa81ebdaeecf Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Mon, 6 Jun 2022 17:05:42 -0400 Subject: [PATCH 41/43] Fix typo --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index f217a1dcd..1d0c873dd 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -375,7 +375,7 @@ function _alg(x::AbstractArray) else firstindex(x) == 1 || throw(ArgumentError("alg=:radixsort requires either one based indexing or Julia >= 1.9. " - "Use `alg = :dict` as an alternative.")) + * "Use `alg = :dict` as an alternative.")) SortingAlgorithms.RadixSort end end From 8f446ee49bdf3dcf591f1fa32113de01d6ee7225 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Mon, 6 Jun 2022 17:06:17 -0400 Subject: [PATCH 42/43] Style --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index 1d0c873dd..0240d6deb 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -374,7 +374,7 @@ function _alg(x::AbstractArray) Base.DEFAULT_UNSTABLE else firstindex(x) == 1 || - throw(ArgumentError("alg=:radixsort requires either one based indexing or Julia >= 1.9. " + throw(ArgumentError("alg = :radixsort requires either one based indexing or Julia >= 1.9. " * "Use `alg = :dict` as an alternative.")) SortingAlgorithms.RadixSort end From e20e28c19de810f238b524ab70ad937732161e61 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 8 Jun 2022 09:24:32 +0200 Subject: [PATCH 43/43] Minor fixes --- src/counts.jl | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 0240d6deb..4333edc25 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -371,12 +371,12 @@ end function _alg(x::AbstractArray) @static if VERSION >= v"1.9.0-DEV" - Base.DEFAULT_UNSTABLE + return Base.DEFAULT_UNSTABLE else firstindex(x) == 1 || - throw(ArgumentError("alg = :radixsort requires either one based indexing or Julia >= 1.9. " - * "Use `alg = :dict` as an alternative.")) - SortingAlgorithms.RadixSort + throw(ArgumentError("alg = :radixsort requires either one based indexing or Julia >= 1.9. " * + "Use `alg = :dict` as an alternative.")) + return SortingAlgorithms.RadixSort end end @@ -444,27 +444,13 @@ countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcoun """ - proportionmap(x; alg = :auto) + proportionmap(x) proportionmap(x::AbstractVector, w::AbstractVector{<:Real}) Return a dictionary mapping each unique value in `x` to its proportion in `x`. If a vector of weights `wv` is provided, the proportion of weights is computed rather than the proportion of raw counts. - -`alg` is only allowed for unweighted counting and can be one of: -- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use - `:radixsort`, otherwise use `:dict`. - -- `:radixsort`: if `radixsort_safe(eltype(x)) == true` then use the - [radix sort](https://en.wikipedia.org/wiki/Radix_sort) - algorithm to sort the input vector which will generally lead to - shorter running time. However the radix sort algorithm creates a - copy of the input vector and hence uses more RAM. Choose `:dict` - if the amount of available RAM is a limitation. - -- `:dict`: use `Dict`-based method which is generally slower but uses less - RAM and is safe for any data type. """ proportionmap(x::AbstractArray) = _normalize_countmap(countmap(x), length(x)) proportionmap(x::AbstractArray, wv::AbstractWeights) = _normalize_countmap(countmap(x, wv), sum(wv))