feat: add TLeafC - string - writing support (#940)
* feat: add TLeafC - string - writing support

* style: pre-commit fixes

* Rename test file to match PR number.

* Fixes logic for string offset size and adds a test for strings with more than 254 bytes.

* Check that the files created can also be read with ROOT, and make the uncompressed data ignore the last offset as ROOT does.

* Hard-code the fix for the tests just to show where the problem comes from.

* Adds fLen computation and updating when writing TLeafC.

* style: pre-commit fixes

* Reverts fLen to 1.

* Moves the computation of the fLen maximum into the string-writing path.

* style: pre-commit fixes

* Add some stress tests for extremes and corner cases.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jim Pivarski <jpivarski@gmail.com>
3 people authored Sep 25, 2023
1 parent d939d21 commit 1d85f27
Showing 2 changed files with 391 additions and 13 deletions.
193 changes: 180 additions & 13 deletions src/uproot/writing/_cascadetree.py
@@ -40,6 +40,7 @@
     numpy.dtype(">u8"): "l",
     numpy.dtype(">f4"): "F",
     numpy.dtype(">f8"): "D",
+    numpy.dtype(">U"): "C",
 }
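For orientation: these single letters are ROOT's TLeaf type codes, and the new entry maps NumPy's variable-width Unicode dtype to "C", the code for TLeafC (variable-length string). A minimal sketch of how such a lookup feeds a branch title, with leaf_title as a hypothetical helper, not uproot API:

    import numpy

    _dtype_to_char = {numpy.dtype(">f8"): "D", numpy.dtype(">U"): "C"}  # excerpt

    def leaf_title(name, dtype):
        # Hypothetical illustration: ROOT spells the leaf type in the branch
        # title as "name/CODE", so a string branch named "word" is "word/C".
        return f"{name}/{_dtype_to_char[numpy.dtype(dtype)]}"

    print(leaf_title("word", ">U"))  # prints: word/C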


@@ -190,7 +191,11 @@ def __init__(
                 parameters = {}

             if parameters.get("__array__") == "string":
-                raise NotImplementedError("array of strings")
+                if branch_name not in self._branch_lookup:
+                    self._branch_lookup[branch_name] = len(self._branch_data)
+                    self._branch_data.append(
+                        self._branch_np(branch_name, branch_type, numpy.dtype(str))
+                    )

             elif parameters.get("__array__") == "bytes":
                 raise NotImplementedError("array of bytes")
@@ -672,21 +677,63 @@ def extend(self, file, sink, data):
                 continue

             if datum["counter"] is None:
-                big_endian = numpy.asarray(branch_array, dtype=datum["dtype"])
-                if big_endian.shape != (len(branch_array),) + datum["shape"]:
-                    raise ValueError(
-                        "'extend' must fill branches with a consistent shape: has {}, trying to fill with {}".format(
-                            datum["shape"],
-                            big_endian.shape[1:],
-                        )
-                    )
-                tofill.append((branch_name, datum["compression"], big_endian, None))
-
-                if datum["kind"] == "counter":
-                    datum["tleaf_maximum_value"] = max(
-                        big_endian.max(), datum["tleaf_maximum_value"]
-                    )
+                if datum["dtype"] == ">U0":
+                    lengths = numpy.asarray(awkward.num(branch_array.layout))
+                    which_big = lengths >= 255
+
+                    lengths_extension_offsets = numpy.empty(
+                        len(branch_array.layout) + 1, numpy.int64
+                    )
+                    lengths_extension_offsets[0] = 0
+                    numpy.cumsum(which_big * 4, out=lengths_extension_offsets[1:])
+
+                    lengths_extension = awkward.contents.ListOffsetArray(
+                        awkward.index.Index64(lengths_extension_offsets),
+                        awkward.contents.NumpyArray(
+                            lengths[which_big].astype(">u4").view("u1")
+                        ),
+                    )
+
+                    lengths[which_big] = 255
+
+                    leafc_data_awkward = awkward.concatenate(
+                        [
+                            lengths.reshape(-1, 1).astype("u1"),
+                            lengths_extension,
+                            awkward.without_parameters(branch_array.layout),
+                        ],
+                        axis=1,
+                    )
+
+                    big_endian = numpy.asarray(awkward.flatten(leafc_data_awkward))
+                    big_endian_offsets = (
+                        lengths_extension_offsets
+                        + numpy.asarray(branch_array.layout.offsets)
+                        + numpy.arange(len(branch_array.layout.offsets))
+                    ).astype(">i4", copy=True)
+                    tofill.append(
+                        (
+                            branch_name,
+                            datum["compression"],
+                            big_endian,
+                            big_endian_offsets,
+                        )
+                    )
+                else:
+                    big_endian = numpy.asarray(branch_array, dtype=datum["dtype"])
+                    if big_endian.shape != (len(branch_array),) + datum["shape"]:
+                        raise ValueError(
+                            "'extend' must fill branches with a consistent shape: has {}, trying to fill with {}".format(
+                                datum["shape"],
+                                big_endian.shape[1:],
+                            )
+                        )
+                    tofill.append((branch_name, datum["compression"], big_endian, None))
+                    if datum["kind"] == "counter":
+                        datum["tleaf_maximum_value"] = max(
+                            big_endian.max(), datum["tleaf_maximum_value"]
+                        )

             else:
                 try:
                     awkward = uproot.extras.awkward()
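The awkward-array manipulation above implements ROOT's TLeafC encoding: every entry is prefixed with a one-byte length, and entries of 255 bytes or more instead store the escape byte 255 followed by the true length as a big-endian 4-byte unsigned integer. That is why which_big contributes 4 extra bytes per long entry to lengths_extension_offsets, and why numpy.arange adds one byte per entry (the length prefix) when computing big_endian_offsets. A standalone sketch of the same byte layout in scalar Python (encode_tleafc is a hypothetical helper, not uproot API):

    import struct

    def encode_tleafc(strings):
        # Byte-for-byte sketch of the payload that awkward.concatenate
        # assembles above in vectorized form.
        out = bytearray()
        for s in strings:
            data = s.encode("utf-8")
            if len(data) >= 255:
                out += b"\xff" + struct.pack(">I", len(data))  # escape + true length
            else:
                out += struct.pack(">B", len(data))  # one-byte length prefix
            out += data
        return bytes(out)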
@@ -760,7 +807,13 @@ def extend(self, file, sink, data):
         for branch_name, compression, big_endian, big_endian_offsets in tofill:
             datum = self._branch_data[self._branch_lookup[branch_name]]

-            if big_endian_offsets is None:
+            if datum["dtype"] == ">U0":
+                totbytes, zipbytes, location = self.write_string_basket(
+                    sink, branch_name, compression, big_endian, big_endian_offsets
+                )
+                datum["fEntryOffsetLen"] = 4 * (len(big_endian_offsets) - 1)
+
+            elif big_endian_offsets is None:
                 totbytes, zipbytes, location = self.write_np_basket(
                     sink, branch_name, compression, big_endian
                 )
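fEntryOffsetLen records how many bytes of entry offsets the basket carries: 4 bytes per entry, excluding the extra trailing element of big_endian_offsets. A quick check of that arithmetic with invented values:

    n_entries = 3
    big_endian_offsets = [0, 5, 11, 20]  # always one more offset than entries
    fEntryOffsetLen = 4 * (len(big_endian_offsets) - 1)
    assert fEntryOffsetLen == 4 * n_entries  # 12 bytes of offset data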
@@ -965,6 +1018,9 @@ def write_anew(self, sink):
                 special_struct = uproot.models.TLeaf._tleaff1_format1
             elif letter_upper == "D":
                 special_struct = uproot.models.TLeaf._tleafd1_format1
+            elif letter_upper == "C":
+                special_struct = uproot.models.TLeaf._tleafc1_format1
+
             fLenType = datum["dtype"].itemsize
             fIsUnsigned = letter != letter_upper

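Like the other specialized TLeaf structs chosen in this chain, _tleafc1_format1 presumably packs the two members TLeafC adds on top of the generic TLeaf, fMinimum and fMaximum, as big-endian 32-bit integers (mirroring ROOT's TLeafC class definition; the actual struct lives in uproot.models.TLeaf):

    import struct

    # Assumed shape of uproot.models.TLeaf._tleafc1_format1:
    # (fMinimum, fMaximum) of a TLeafC as big-endian int32s.
    _tleafc1_format1 = struct.Struct(">ii")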
@@ -1273,6 +1329,27 @@ def write_updates(self, sink):

             datum["arrays_write_start"] = datum["arrays_write_stop"]

+            if datum["dtype"] == ">U0":
+                position = (
+                    base
+                    + datum["basket_metadata_start"]
+                    - 25  # empty TObjArray of fBaskets (embedded)
+                    - 8  # specialized TLeaf* members (fMinimum, fMaximum)
+                    - 4  # null fLeafCount
+                    - 14  # generic TLeaf members
+                )
+                sink.write(
+                    position,
+                    uproot.models.TLeaf._tleaf2_format0.pack(
+                        self._metadata["fLen"],
+                        datum["dtype"].itemsize,
+                        0,
+                        datum["kind"] == "counter",
+                        _dtype_to_char[datum["dtype"]]
+                        != _dtype_to_char[datum["dtype"]].upper(),
+                    ),
+                )
+
             if datum["kind"] == "counter":
                 position = (
                     base
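This is the fLen fix from the commit messages: a TLeafC's fLen must report the longest string in the branch so that ROOT can size its read buffer, but that maximum is only known after the baskets are written, so write_updates seeks back to the generic TLeaf members and rewrites them in place. Under the assumption that _tleaf2_format0 matches the five values packed above, its layout would be:

    import struct

    # Assumed layout of uproot.models.TLeaf._tleaf2_format0:
    # fLen (int32), fLenType (int32), fOffset (int32),
    # fIsRange (bool), fIsUnsigned (bool)
    _tleaf2_format0 = struct.Struct(">iii??")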
@@ -1429,6 +1506,96 @@ def write_jagged_basket(self, sink, branch_name, compression, array, offsets):

         return fKeylen + fObjlen, fNbytes, location

+    def write_string_basket(self, sink, branch_name, compression, array, offsets):
+        fClassName = uproot.serialization.string("TBasket")
+        fName = uproot.serialization.string(branch_name)
+        fTitle = uproot.serialization.string(self._name)
+        fKeylen = (
+            uproot.reading._key_format_big.size
+            + len(fClassName)
+            + len(fName)
+            + len(fTitle)
+            + uproot.models.TBasket._tbasket_format2.size
+            + 1
+        )
+
+        itemsize = array.dtype.itemsize
+        for item in array.shape[1:]:
+            itemsize *= item
+        try:
+            uproot.extras.awkward()
+        except ModuleNotFoundError as err:
+            raise TypeError(
+                f"'awkward' cannot be imported: {self._branch_type!r}"
+            ) from err
+
+        offsets *= itemsize
+        offsets += fKeylen
+
+        raw_array = uproot._util.tobytes(array)
+        raw_offsets = uproot._util.tobytes(offsets)
+        uncompressed_data = (
+            raw_array
+            + _tbasket_offsets_length.pack(len(offsets))
+            + raw_offsets[:-4]
+            + b"\x00\x00\x00\x00"
+        )
+        compressed_data = uproot.compression.compress(uncompressed_data, compression)
+
+        # get size of biggest string
+        self._metadata["fLen"] = (
+            0
+            if len(offsets) == 1
+            else max([offsets[i + 1] - offsets[i] for i in range(len(offsets) - 1)])
+        )
+
+        fLast = offsets[-1]
+        offsets[-1] = 0
+
+        fObjlen = len(uncompressed_data)
+        fNbytes = fKeylen + len(compressed_data)
+
+        parent_location = self._directory.key.location  # FIXME: is this correct?
+
+        location = self._freesegments.allocate(fNbytes, dry_run=False)
+
+        out = []
+        out.append(
+            uproot.reading._key_format_big.pack(
+                fNbytes,
+                1004,  # fVersion
+                fObjlen,
+                uproot._util.datetime_to_code(datetime.datetime.now()),  # fDatime
+                fKeylen,
+                0,  # fCycle
+                location,  # fSeekKey
+                parent_location,  # fSeekPdir
+            )
+        )
+        out.append(fClassName)
+        out.append(fName)
+        out.append(fTitle)
+        out.append(
+            uproot.models.TBasket._tbasket_format2.pack(
+                3,  # fVersion
+                32000,  # fBufferSize
+                1000,  # fNevBufSize
+                len(offsets) - 1,  # fNevBuf
+                fLast,
+            )
+        )
+
+        out.append(b"\x00")  # part of the Key (included in fKeylen, at least)
+
+        out.append(compressed_data)
+
+        sink.write(location, b"".join(out))
+        self._freesegments.write(sink)
+        sink.set_file_length(self._freesegments.fileheader.end)
+        sink.flush()
+
+        return fKeylen + fObjlen, fNbytes, location
+

 _tbasket_offsets_length = struct.Struct(">I")

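Putting it together, the uncompressed payload of a string basket is the encoded character data followed by an offset table: a 4-byte count, one key-relative offset per entry, and a zeroed final slot, since ROOT ignores the last offset (as the commit message notes). A standalone sketch of that assembly (build_string_basket_payload is a hypothetical helper, not uproot API):

    import struct

    def build_string_basket_payload(raw_array, offsets_in_data, fkeylen):
        # offsets_in_data: byte position of each entry's start within the
        # data, plus one trailing end position (n_entries + 1 values).
        # Mirrors write_string_basket: data, then ">I" offset count, then
        # key-relative entry offsets with the final slot zeroed out.
        offsets = [fkeylen + pos for pos in offsets_in_data]
        payload = bytes(raw_array)
        payload += struct.pack(">I", len(offsets))
        payload += b"".join(struct.pack(">i", off) for off in offsets[:-1])
        payload += b"\x00\x00\x00\x00"
        return payload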