JuliaStats · nalimilan · Jun 8, 2022 · Oct 2, 2021 · Oct 7, 2021 · Oct 10, 2021
diff --git a/Project.toml b/Project.toml
@@ -28,8 +28,9 @@ julia = "1"
 [extras]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Dates", "DelimitedFiles", "StableRNGs", "Test"]
+test = ["Dates", "DelimitedFiles", "OffsetArrays", "StableRNGs", "Test"]
diff --git a/src/counts.jl b/src/counts.jl
@@ -16,24 +16,24 @@ end
 
 #### functions for counting a single list of integers (1D)
 """
-    addcounts!(r, x, levels::UnitRange{<:Int}, [wv::AbstractWeights])
+    addcounts!(r, x, levels::UnitRange{<:Integer}, [wv::AbstractWeights])
 
 Add the number of occurrences in `x` of each value in `levels` to an existing
-array `r`. If a weighting vector `wv` is specified, the sum of weights is used
-rather than the raw counts.
+array `r`. For each `xi ∈ x`, if `xi == levels[j]`, then we increment `r[j]`.
+
+If a weighting vector `wv` is specified, the sum of weights is used rather than the
+raw counts.
 """
 function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange)
-    # add counts of integers from x to r
+    # add counts of integers from x that fall within levels to r
 
-    k = length(levels)
-    length(r) == k || throw(DimensionMismatch())
+    checkbounds(r, axes(levels)...)
-    checkbounds(r, axes(levels)...)
+    (firstindex(r) == 1) && length(r) == k) || throw(DimensionMismatch())
-    checkbounds(r, axes(levels)...)
+    (firstindex(r) == 1) && length(r) == k) || throw(DimensionMismatch())
 
-    m0 = levels[1]
-    m1 = levels[end]
-    b = m0 - 1
+    m0 = first(levels)
+    m1 = last(levels)
+    b = m0 - firstindex(levels) # firstindex(levels) == 1 because levels::IntUnitRange
 
-    @inbounds for i in 1 : length(x)
-        xi = x[i]
+    @inbounds for xi in x
         if m0 <= xi <= m1
             r[xi - b] += 1
         end
@@ -42,15 +42,21 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange)
 end
 
 function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights)
-    k = length(levels)
-    length(r) == k || throw(DimensionMismatch())
+    # add wv weighted counts of integers from x that fall within levels to r
+
+    length(x) == length(wv) ||
+        throw(DimensionMismatch("x and wv must have the same length, got $(length(x)) and $(length(wv))"))
 
-    m0 = levels[1]
-    m1 = levels[end]
+    xv = vec(x) # discard shape because weights() discards shape
+
+    checkbounds(r, axes(levels)...)
+
+    m0 = first(levels)
+    m1 = last(levels)
     b = m0 - 1
 
-    @inbounds for i in 1 : length(x)
-        xi = x[i]
+    @inbounds for i in eachindex(xv, wv)
+        xi = xv[i]
         if m0 <= xi <= m1
             r[xi - b] += wv[i]
         end
@@ -69,8 +75,8 @@ falling in that range will be considered (the others will be ignored without
 raising an error or a warning). If an integer `k` is provided, only values in the
 range `1:k` will be considered.
 
-If a weighting vector `wv` is specified, the sum of the weights is used rather than the
-raw counts.
+If a vector of weights `wv` is provided, the proportion of weights is computed rather
+than the proportion of raw counts.
 
 The output is a vector of length `length(levels)`.
 """
@@ -90,8 +96,10 @@ counts(x::IntegerArray, wv::AbstractWeights) = counts(x, span(x), wv)
     proportions(x, levels=span(x), [wv::AbstractWeights])
 
 Return the proportion of values in the range `levels` that occur in `x`.
-Equivalent to `counts(x, levels) / length(x)`. If a weighting vector `wv`
-is specified, the sum of the weights is used rather than the raw counts.
+Equivalent to `counts(x, levels) / length(x)`.
+
+If a vector of weights `wv` is provided, the proportion of weights is computed rather
+than the proportion of raw counts.
 """
 proportions(x::IntegerArray, levels::IntUnitRange) = counts(x, levels) .* inv(length(x))
 proportions(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) =
@@ -101,6 +109,9 @@ proportions(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) =
     proportions(x, k::Integer, [wv::AbstractWeights])
 
 Return the proportion of integers in 1 to `k` that occur in `x`.
+
+If a vector of weights `wv` is provided, the proportion of weights is computed rather
+than the proportion of raw counts.
 """
 proportions(x::IntegerArray, k::Integer) = proportions(x, 1:k)
 proportions(x::IntegerArray, k::Integer, wv::AbstractWeights) = proportions(x, 1:k, wv)
@@ -110,26 +121,22 @@ proportions(x::IntegerArray, wv::AbstractWeights) = proportions(x, span(x), wv)
 #### functions for counting a single list of integers (2D)
 
 function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange})
-    # add counts of integers from x to r
-
-    n = length(x)
-    length(y) == n || throw(DimensionMismatch())
+    # add counts of pairs from zip(x,y) to r
 
     xlevels, ylevels = levels
 
-    kx = length(xlevels)
-    ky = length(ylevels)
-    size(r) == (kx, ky) || throw(DimensionMismatch())
 
-    mx0 = xlevels[1]
-    mx1 = xlevels[end]
-    my0 = ylevels[1]
-    my1 = ylevels[end]
+    checkbounds(r, axes(xlevels, 1), axes(ylevels, 1))
+
+    mx0 = first(xlevels)
+    mx1 = last(xlevels)
+    my0 = first(ylevels)
+    my1 = last(ylevels)
 
     bx = mx0 - 1
     by = my0 - 1
 
-    for i = 1:n
+    for i in eachindex(vec(x), vec(y))
         xi = x[i]
         yi = y[i]
         if (mx0 <= xi <= mx1) && (my0 <= yi <= my1)
@@ -141,28 +148,31 @@ end
 
 function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray,
                     levels::NTuple{2,IntUnitRange}, wv::AbstractWeights)
-    # add counts of integers from x to r
+    # add counts of pairs from zip(x,y) to r
+
+    length(x) == length(y) == length(wv) ||
+        throw(DimensionMismatch("x, y, and wv must have the same length, but got $(length(x)), $(length(y)), and $(length(wv))"))
+
+    axes(x) == axes(y) ||
+        throw(DimensionMismatch("x and y must have the same axes, but got $(axes(x)) and $(axes(y))"))
 
-    n = length(x)
-    length(y) == length(wv) == n || throw(DimensionMismatch())
+    xv, yv = vec(x), vec(y) # discard shape because weights() discards shape
 
     xlevels, ylevels = levels
 
-    kx = length(xlevels)
-    ky = length(ylevels)
-    size(r) == (kx, ky) || throw(DimensionMismatch())
+    checkbounds(r, axes(xlevels, 1), axes(ylevels, 1))
 
-    mx0 = xlevels[1]
-    mx1 = xlevels[end]
-    my0 = ylevels[1]
-    my1 = ylevels[end]
+    mx0 = first(xlevels)
+    mx1 = last(xlevels)
+    my0 = first(ylevels)
+    my1 = last(ylevels)
 
     bx = mx0 - 1
     by = my0 - 1
 
-    for i = 1:n
-        xi = x[i]
-        yi = y[i]
+    for i in eachindex(xv, yv, wv)
+        xi = xv[i]
+        yi = yv[i]
         if (mx0 <= xi <= mx1) && (my0 <= yi <= my1)
             r[xi - bx, yi - by] += wv[i]
         end
@@ -235,13 +245,15 @@ end
 
 
 """
-    addcounts!(dict, x[, wv]; alg = :auto)
+    addcounts!(dict, x; alg = :auto)
+    addcounts!(dict, x, wv)
 
 Add counts based on `x` to a count map. New entries will be added if new values come up.
+
 If a weighting vector `wv` is specified, the sum of the weights is used rather than the
 raw counts.
 
-`alg` can be one of:
+`alg` is only allowed for unweighted counting and can be one of:
 - `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use
                      `:radixsort`, otherwise use `:dict`.
 
@@ -284,9 +296,9 @@ function addcounts_dict!(cm::Dict{T}, x) where T
 end
 
 # If the bits type is of small size i.e. it can have up to 65536 distinct values
-# then it is always better to apply a counting-sort like reduce algorithm for 
+# then it is always better to apply a counting-sort like reduce algorithm for
 # faster results and less memory usage. However we still wish to enable others
-# to write generic algorithms, therefore the methods below still accept the 
+# to write generic algorithms, therefore the methods below still accept the
 # `alg` argument but it is ignored.
 function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored)
     sumx = sum(x)
@@ -335,32 +347,42 @@ const BaseRadixSortSafeTypes = Union{Int8, Int16, Int32, Int64, Int128,
 "Can the type be safely sorted by radixsort"
 radixsort_safe(::Type{T}) where T = T<:BaseRadixSortSafeTypes
 
-function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T
+function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractVector{T}) where T
     isempty(sx) && return cm
-    last_sx = sx[1]
-    tmpcount = get(cm, last_sx, 0) + 1
+    last_sx = first(sx)
+    start_i = firstindex(sx)
 
     # now the data is sorted: can just run through and accumulate values before
     # adding into the Dict
-    @inbounds for i in 2:length(sx)
+    @inbounds for i in start_i+1:lastindex(sx)
         sxi = sx[i]
-        if last_sx == sxi
-            tmpcount += 1
-        else
-            cm[last_sx] = tmpcount
+        if last_sx != sxi
+            cm[last_sx] = get(cm, last_sx, 0) + i - start_i
             last_sx = sxi
-            tmpcount = get(cm, last_sx, 0) + 1
+            start_i = i
         end
     end
 
-    cm[sx[end]] = tmpcount
+    last_sx = last(sx)
+    cm[last_sx] = get(cm, last_sx, 0) + lastindex(sx) + 1 - start_i
 
     return cm
 end
 
+function _alg(x::AbstractArray)
+    @static if VERSION >= v"1.9.0-DEV"
+        Base.DEFAULT_UNSTABLE
+    else
+        firstindex(x) == 1 ||
+            throw(ArgumentError("alg=:radixsort requires either one based indexing or Julia >= 1.9. "
+                              * "Use `alg = :dict` as an alternative."))
+        SortingAlgorithms.RadixSort
+    end
+end
+
 function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T
     # sort the x using radixsort
-    sx = sort(x, alg = RadixSort)
+    sx = sort(vec(x), alg=_alg(x))
 
     # Delegate the loop to a separate function since sort might not
     # be inferred in Julia 0.6 after SortingAlgorithms is loaded.
@@ -369,18 +391,24 @@ function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T
 end
 
 # fall-back for `x` an iterator
-function addcounts_radixsort!(cm::Dict{T}, x) where T 
-    sx = sort!(collect(x), alg = RadixSort)
+function addcounts_radixsort!(cm::Dict{T}, x) where T
+    cx = vec(collect(x))
+    sx = sort!(cx, alg = _alg(cx))
     return _addcounts_radix_sort_loop!(cm, sx)
 end
 
 function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real}
-    n = length(x)
-    length(wv) == n || throw(DimensionMismatch())
+    # add wv weighted counts of integers from x to cm
+
+    length(x) == length(wv) ||
+        throw(DimensionMismatch("x and wv must have the same length, got $(length(x)) and $(length(wv))"))
+
+    xv = vec(x) # discard shape because weights() discards shape
+
     z = zero(W)
 
-    for i = 1 : n
-        @inbounds xi = x[i]
+    for i in eachindex(xv, wv)
+        @inbounds xi = xv[i]
         @inbounds wi = wv[i]
         cm[xi] = get(cm, xi, z) + wi
     end
@@ -390,11 +418,14 @@ end
 
 """
     countmap(x; alg = :auto)
-    countmap(x::AbstractVector, w::AbstractVector{<:Real}; alg = :auto)
+    countmap(x::AbstractVector, wv::AbstractVector{<:Real})
 
-Return a dictionary mapping each unique value in `x` to its number
-of occurrences. A vector of weights `w` can be provided when `x` is a vector.
+Return a dictionary mapping each unique value in `x` to its number of occurrences.
 
+If a weighting vector `wv` is specified, the sum of weights is used rather than the
+raw counts.
+
+`alg` is only allowed for unweighted counting and can be one of:
 - `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use
                      `:radixsort`, otherwise use `:dict`.
 
@@ -413,10 +444,27 @@ countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcoun
 
 
 """
-    proportionmap(x)
+    proportionmap(x; alg = :auto)
+    proportionmap(x::AbstractVector, w::AbstractVector{<:Real})
+
+Return a dictionary mapping each unique value in `x` to its proportion in `x`.
+
+If a vector of weights `wv` is provided, the proportion of weights is computed rather
+than the proportion of raw counts.
+
+`alg` is only allowed for unweighted counting and can be one of:
+- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use
+                     `:radixsort`, otherwise use `:dict`.
 
-Return a dictionary mapping each unique value in `x` to its
-proportion in `x`.
+- `:radixsort`:      if `radixsort_safe(eltype(x)) == true` then use the
+                     [radix sort](https://en.wikipedia.org/wiki/Radix_sort)
+                     algorithm to sort the input vector which will generally lead to
+                     shorter running time. However the radix sort algorithm creates a
+                     copy of the input vector and hence uses more RAM. Choose `:dict`
+                     if the amount of available RAM is a limitation.
+
+- `:dict`:           use `Dict`-based method which is generally slower but uses less
+                     RAM and is safe for any data type.
 """
 proportionmap(x::AbstractArray) = _normalize_countmap(countmap(x), length(x))
 proportionmap(x::AbstractArray, wv::AbstractWeights) = _normalize_countmap(countmap(x, wv), sum(wv))
diff --git a/src/weights.jl b/src/weights.jl
@@ -21,6 +21,7 @@ length(wv::AbstractWeights) = length(wv.values)
 sum(wv::AbstractWeights) = wv.sum
 isempty(wv::AbstractWeights) = isempty(wv.values)
 size(wv::AbstractWeights) = size(wv.values)
+Base.axes(wv::AbstractWeights) = Base.axes(wv.values)
 
 Base.convert(::Type{Vector}, wv::AbstractWeights) = convert(Vector, wv.values)
 
@@ -299,6 +300,7 @@ sum(wv::UnitWeights{T}) where T = convert(T, length(wv))
 isempty(wv::UnitWeights) = iszero(wv.len)
 length(wv::UnitWeights) = wv.len
 size(wv::UnitWeights) = tuple(length(wv))
+Base.axes(wv::UnitWeights) = tuple(Base.OneTo(length(wv)))
 
 Base.convert(::Type{Vector}, wv::UnitWeights{T}) where {T} = ones(T, length(wv))