Skip to content

Commit

Permalink
make counting more robust to input datatype (#722)
Browse files Browse the repository at this point in the history
  • Loading branch information
LilithHafner authored Jun 8, 2022
1 parent 1815e1e commit f0cccd6
Show file tree
Hide file tree
Showing 5 changed files with 304 additions and 228 deletions.
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ julia = "1"
[extras]
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Dates", "DelimitedFiles", "StableRNGs", "Test"]
test = ["Dates", "DelimitedFiles", "OffsetArrays", "StableRNGs", "Test"]
178 changes: 106 additions & 72 deletions src/counts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,24 +16,24 @@ end

#### functions for counting a single list of integers (1D)
"""
addcounts!(r, x, levels::UnitRange{<:Int}, [wv::AbstractWeights])
addcounts!(r, x, levels::UnitRange{<:Integer}, [wv::AbstractWeights])
Add the number of occurrences in `x` of each value in `levels` to an existing
array `r`. If a weighting vector `wv` is specified, the sum of weights is used
rather than the raw counts.
array `r`. For each `xi ∈ x`, if `xi == levels[j]`, then we increment `r[j]`.
If a weighting vector `wv` is specified, the sum of weights is used rather than the
raw counts.
"""
function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange)
# add counts of integers from x to r
# add counts of integers from x that fall within levels to r

k = length(levels)
length(r) == k || throw(DimensionMismatch())
checkbounds(r, axes(levels)...)

m0 = levels[1]
m1 = levels[end]
b = m0 - 1
m0 = first(levels)
m1 = last(levels)
b = m0 - firstindex(levels) # firstindex(levels) == 1 because levels::IntUnitRange

@inbounds for i in 1 : length(x)
xi = x[i]
@inbounds for xi in x
if m0 <= xi <= m1
r[xi - b] += 1
end
Expand All @@ -42,15 +42,21 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange)
end

function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights)
k = length(levels)
length(r) == k || throw(DimensionMismatch())
# add wv weighted counts of integers from x that fall within levels to r

length(x) == length(wv) ||
throw(DimensionMismatch("x and wv must have the same length, got $(length(x)) and $(length(wv))"))

xv = vec(x) # discard shape because weights() discards shape

checkbounds(r, axes(levels)...)

m0 = levels[1]
m1 = levels[end]
m0 = first(levels)
m1 = last(levels)
b = m0 - 1

@inbounds for i in 1 : length(x)
xi = x[i]
@inbounds for i in eachindex(xv, wv)
xi = xv[i]
if m0 <= xi <= m1
r[xi - b] += wv[i]
end
Expand All @@ -69,8 +75,8 @@ falling in that range will be considered (the others will be ignored without
raising an error or a warning). If an integer `k` is provided, only values in the
range `1:k` will be considered.
If a weighting vector `wv` is specified, the sum of the weights is used rather than the
raw counts.
If a vector of weights `wv` is provided, the proportion of weights is computed rather
than the proportion of raw counts.
The output is a vector of length `length(levels)`.
"""
Expand All @@ -90,8 +96,10 @@ counts(x::IntegerArray, wv::AbstractWeights) = counts(x, span(x), wv)
proportions(x, levels=span(x), [wv::AbstractWeights])
Return the proportion of values in the range `levels` that occur in `x`.
Equivalent to `counts(x, levels) / length(x)`. If a weighting vector `wv`
is specified, the sum of the weights is used rather than the raw counts.
Equivalent to `counts(x, levels) / length(x)`.
If a vector of weights `wv` is provided, the proportion of weights is computed rather
than the proportion of raw counts.
"""
proportions(x::IntegerArray, levels::IntUnitRange) = counts(x, levels) .* inv(length(x))
proportions(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) =
Expand All @@ -101,6 +109,9 @@ proportions(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) =
proportions(x, k::Integer, [wv::AbstractWeights])
Return the proportion of integers in 1 to `k` that occur in `x`.
If a vector of weights `wv` is provided, the proportion of weights is computed rather
than the proportion of raw counts.
"""
proportions(x::IntegerArray, k::Integer) = proportions(x, 1:k)
proportions(x::IntegerArray, k::Integer, wv::AbstractWeights) = proportions(x, 1:k, wv)
Expand All @@ -110,26 +121,22 @@ proportions(x::IntegerArray, wv::AbstractWeights) = proportions(x, span(x), wv)
#### functions for counting a single list of integers (2D)

function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange})
# add counts of integers from x to r

n = length(x)
length(y) == n || throw(DimensionMismatch())
# add counts of pairs from zip(x,y) to r

xlevels, ylevels = levels

kx = length(xlevels)
ky = length(ylevels)
size(r) == (kx, ky) || throw(DimensionMismatch())

mx0 = xlevels[1]
mx1 = xlevels[end]
my0 = ylevels[1]
my1 = ylevels[end]
checkbounds(r, axes(xlevels, 1), axes(ylevels, 1))

mx0 = first(xlevels)
mx1 = last(xlevels)
my0 = first(ylevels)
my1 = last(ylevels)

bx = mx0 - 1
by = my0 - 1

for i = 1:n
for i in eachindex(vec(x), vec(y))
xi = x[i]
yi = y[i]
if (mx0 <= xi <= mx1) && (my0 <= yi <= my1)
Expand All @@ -141,28 +148,31 @@ end

function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray,
levels::NTuple{2,IntUnitRange}, wv::AbstractWeights)
# add counts of integers from x to r
# add counts of pairs from zip(x,y) to r

length(x) == length(y) == length(wv) ||
throw(DimensionMismatch("x, y, and wv must have the same length, but got $(length(x)), $(length(y)), and $(length(wv))"))

n = length(x)
length(y) == length(wv) == n || throw(DimensionMismatch())
axes(x) == axes(y) ||
throw(DimensionMismatch("x and y must have the same axes, but got $(axes(x)) and $(axes(y))"))

xv, yv = vec(x), vec(y) # discard shape because weights() discards shape

xlevels, ylevels = levels

kx = length(xlevels)
ky = length(ylevels)
size(r) == (kx, ky) || throw(DimensionMismatch())
checkbounds(r, axes(xlevels, 1), axes(ylevels, 1))

mx0 = xlevels[1]
mx1 = xlevels[end]
my0 = ylevels[1]
my1 = ylevels[end]
mx0 = first(xlevels)
mx1 = last(xlevels)
my0 = first(ylevels)
my1 = last(ylevels)

bx = mx0 - 1
by = my0 - 1

for i = 1:n
xi = x[i]
yi = y[i]
for i in eachindex(xv, yv, wv)
xi = xv[i]
yi = yv[i]
if (mx0 <= xi <= mx1) && (my0 <= yi <= my1)
r[xi - bx, yi - by] += wv[i]
end
Expand Down Expand Up @@ -235,13 +245,15 @@ end


"""
addcounts!(dict, x[, wv]; alg = :auto)
addcounts!(dict, x; alg = :auto)
addcounts!(dict, x, wv)
Add counts based on `x` to a count map. New entries will be added if new values come up.
If a weighting vector `wv` is specified, the sum of the weights is used rather than the
raw counts.
`alg` can be one of:
`alg` is only allowed for unweighted counting and can be one of:
- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use
`:radixsort`, otherwise use `:dict`.
Expand Down Expand Up @@ -284,9 +296,9 @@ function addcounts_dict!(cm::Dict{T}, x) where T
end

# If the bits type is of small size i.e. it can have up to 65536 distinct values
# then it is always better to apply a counting-sort like reduce algorithm for
# then it is always better to apply a counting-sort like reduce algorithm for
# faster results and less memory usage. However we still wish to enable others
# to write generic algorithms, therefore the methods below still accept the
# to write generic algorithms, therefore the methods below still accept the
# `alg` argument but it is ignored.
function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored)
sumx = sum(x)
Expand Down Expand Up @@ -335,32 +347,42 @@ const BaseRadixSortSafeTypes = Union{Int8, Int16, Int32, Int64, Int128,
"Can the type be safely sorted by radixsort"
radixsort_safe(::Type{T}) where T = T<:BaseRadixSortSafeTypes

function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T
function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractVector{T}) where T
isempty(sx) && return cm
last_sx = sx[1]
tmpcount = get(cm, last_sx, 0) + 1
last_sx = first(sx)
start_i = firstindex(sx)

# now the data is sorted: can just run through and accumulate values before
# adding into the Dict
@inbounds for i in 2:length(sx)
@inbounds for i in start_i+1:lastindex(sx)
sxi = sx[i]
if last_sx == sxi
tmpcount += 1
else
cm[last_sx] = tmpcount
if last_sx != sxi
cm[last_sx] = get(cm, last_sx, 0) + i - start_i
last_sx = sxi
tmpcount = get(cm, last_sx, 0) + 1
start_i = i
end
end

cm[sx[end]] = tmpcount
last_sx = last(sx)
cm[last_sx] = get(cm, last_sx, 0) + lastindex(sx) + 1 - start_i

return cm
end

function _alg(x::AbstractArray)
@static if VERSION >= v"1.9.0-DEV"
return Base.DEFAULT_UNSTABLE
else
firstindex(x) == 1 ||
throw(ArgumentError("alg = :radixsort requires either one based indexing or Julia >= 1.9. " *
"Use `alg = :dict` as an alternative."))
return SortingAlgorithms.RadixSort
end
end

function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T
# sort the x using radixsort
sx = sort(x, alg = RadixSort)
sx = sort(vec(x), alg=_alg(x))

# Delegate the loop to a separate function since sort might not
# be inferred in Julia 0.6 after SortingAlgorithms is loaded.
Expand All @@ -369,18 +391,24 @@ function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T
end

# fall-back for `x` an iterator
function addcounts_radixsort!(cm::Dict{T}, x) where T
sx = sort!(collect(x), alg = RadixSort)
function addcounts_radixsort!(cm::Dict{T}, x) where T
cx = vec(collect(x))
sx = sort!(cx, alg = _alg(cx))
return _addcounts_radix_sort_loop!(cm, sx)
end

function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real}
n = length(x)
length(wv) == n || throw(DimensionMismatch())
# add wv weighted counts of integers from x to cm

length(x) == length(wv) ||
throw(DimensionMismatch("x and wv must have the same length, got $(length(x)) and $(length(wv))"))

xv = vec(x) # discard shape because weights() discards shape

z = zero(W)

for i = 1 : n
@inbounds xi = x[i]
for i in eachindex(xv, wv)
@inbounds xi = xv[i]
@inbounds wi = wv[i]
cm[xi] = get(cm, xi, z) + wi
end
Expand All @@ -390,11 +418,14 @@ end

"""
countmap(x; alg = :auto)
countmap(x::AbstractVector, w::AbstractVector{<:Real}; alg = :auto)
countmap(x::AbstractVector, wv::AbstractVector{<:Real})
Return a dictionary mapping each unique value in `x` to its number
of occurrences. A vector of weights `w` can be provided when `x` is a vector.
Return a dictionary mapping each unique value in `x` to its number of occurrences.
If a weighting vector `wv` is specified, the sum of weights is used rather than the
raw counts.
`alg` is only allowed for unweighted counting and can be one of:
- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use
`:radixsort`, otherwise use `:dict`.
Expand All @@ -414,9 +445,12 @@ countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcoun

"""
proportionmap(x)
proportionmap(x::AbstractVector, w::AbstractVector{<:Real})
Return a dictionary mapping each unique value in `x` to its proportion in `x`.
Return a dictionary mapping each unique value in `x` to its
proportion in `x`.
If a vector of weights `wv` is provided, the proportion of weights is computed rather
than the proportion of raw counts.
"""
proportionmap(x::AbstractArray) = _normalize_countmap(countmap(x), length(x))
proportionmap(x::AbstractArray, wv::AbstractWeights) = _normalize_countmap(countmap(x, wv), sum(wv))
2 changes: 2 additions & 0 deletions src/weights.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ length(wv::AbstractWeights) = length(wv.values)
sum(wv::AbstractWeights) = wv.sum
isempty(wv::AbstractWeights) = isempty(wv.values)
size(wv::AbstractWeights) = size(wv.values)
Base.axes(wv::AbstractWeights) = Base.axes(wv.values)

Base.dataids(wv::AbstractWeights) = Base.dataids(wv.values)

Expand Down Expand Up @@ -301,6 +302,7 @@ sum(wv::UnitWeights{T}) where T = convert(T, length(wv))
isempty(wv::UnitWeights) = iszero(wv.len)
length(wv::UnitWeights) = wv.len
size(wv::UnitWeights) = tuple(length(wv))
Base.axes(wv::UnitWeights) = tuple(Base.OneTo(length(wv)))

Base.convert(::Type{Vector}, wv::UnitWeights{T}) where {T} = ones(T, length(wv))

Expand Down
Loading

0 comments on commit f0cccd6

Please sign in to comment.