diff --git a/src/uproot/writing/_cascadetree.py b/src/uproot/writing/_cascadetree.py index e05dd45d5..e8e53cd59 100644 --- a/src/uproot/writing/_cascadetree.py +++ b/src/uproot/writing/_cascadetree.py @@ -40,6 +40,7 @@ numpy.dtype(">u8"): "l", numpy.dtype(">f4"): "F", numpy.dtype(">f8"): "D", + numpy.dtype(">U"): "C", } @@ -190,7 +191,11 @@ def __init__( parameters = {} if parameters.get("__array__") == "string": - raise NotImplementedError("array of strings") + if branch_name not in self._branch_lookup: + self._branch_lookup[branch_name] = len(self._branch_data) + self._branch_data.append( + self._branch_np(branch_name, branch_type, numpy.dtype(str)) + ) elif parameters.get("__array__") == "bytes": raise NotImplementedError("array of bytes") @@ -672,21 +677,63 @@ def extend(self, file, sink, data): continue if datum["counter"] is None: - big_endian = numpy.asarray(branch_array, dtype=datum["dtype"]) - if big_endian.shape != (len(branch_array),) + datum["shape"]: - raise ValueError( - "'extend' must fill branches with a consistent shape: has {}, trying to fill with {}".format( - datum["shape"], - big_endian.shape[1:], - ) + if datum["dtype"] == ">U0": + lengths = numpy.asarray(awkward.num(branch_array.layout)) + which_big = lengths >= 255 + + lengths_extension_offsets = numpy.empty( + len(branch_array.layout) + 1, numpy.int64 ) - tofill.append((branch_name, datum["compression"], big_endian, None)) + lengths_extension_offsets[0] = 0 + numpy.cumsum(which_big * 4, out=lengths_extension_offsets[1:]) + + lengths_extension = awkward.contents.ListOffsetArray( + awkward.index.Index64(lengths_extension_offsets), + awkward.contents.NumpyArray( + lengths[which_big].astype(">u4").view("u1") + ), + ) + + lengths[which_big] = 255 - if datum["kind"] == "counter": - datum["tleaf_maximum_value"] = max( - big_endian.max(), datum["tleaf_maximum_value"] + leafc_data_awkward = awkward.concatenate( + [ + lengths.reshape(-1, 1).astype("u1"), + lengths_extension, + 
awkward.without_parameters(branch_array.layout), + ], + axis=1, ) + big_endian = numpy.asarray(awkward.flatten(leafc_data_awkward)) + big_endian_offsets = ( + lengths_extension_offsets + + numpy.asarray(branch_array.layout.offsets) + + numpy.arange(len(branch_array.layout.offsets)) + ).astype(">i4", copy=True) + tofill.append( + ( + branch_name, + datum["compression"], + big_endian, + big_endian_offsets, + ) + ) + else: + big_endian = numpy.asarray(branch_array, dtype=datum["dtype"]) + if big_endian.shape != (len(branch_array),) + datum["shape"]: + raise ValueError( + "'extend' must fill branches with a consistent shape: has {}, trying to fill with {}".format( + datum["shape"], + big_endian.shape[1:], + ) + ) + tofill.append((branch_name, datum["compression"], big_endian, None)) + if datum["kind"] == "counter": + datum["tleaf_maximum_value"] = max( + big_endian.max(), datum["tleaf_maximum_value"] + ) + else: try: awkward = uproot.extras.awkward() @@ -760,7 +807,13 @@ def extend(self, file, sink, data): for branch_name, compression, big_endian, big_endian_offsets in tofill: datum = self._branch_data[self._branch_lookup[branch_name]] - if big_endian_offsets is None: + if datum["dtype"] == ">U0": + totbytes, zipbytes, location = self.write_string_basket( + sink, branch_name, compression, big_endian, big_endian_offsets + ) + datum["fEntryOffsetLen"] = 4 * (len(big_endian_offsets) - 1) + + elif big_endian_offsets is None: totbytes, zipbytes, location = self.write_np_basket( sink, branch_name, compression, big_endian ) @@ -965,6 +1018,9 @@ def write_anew(self, sink): special_struct = uproot.models.TLeaf._tleaff1_format1 elif letter_upper == "D": special_struct = uproot.models.TLeaf._tleafd1_format1 + elif letter_upper == "C": + special_struct = uproot.models.TLeaf._tleafc1_format1 + fLenType = datum["dtype"].itemsize fIsUnsigned = letter != letter_upper @@ -1273,6 +1329,27 @@ def write_updates(self, sink): datum["arrays_write_start"] = datum["arrays_write_stop"] + if 
datum["dtype"] == ">U0": + position = ( + base + + datum["basket_metadata_start"] + - 25 # empty TObjArray of fBaskets (embedded) + - 8 # specialized TLeaf* members (fMinimum, fMaximum) + - 4 # null fLeafCount + - 14 # generic TLeaf members + ) + sink.write( + position, + uproot.models.TLeaf._tleaf2_format0.pack( + self._metadata["fLen"], + datum["dtype"].itemsize, + 0, + datum["kind"] == "counter", + _dtype_to_char[datum["dtype"]] + != _dtype_to_char[datum["dtype"]].upper(), + ), + ) + if datum["kind"] == "counter": position = ( base @@ -1429,6 +1506,96 @@ def write_jagged_basket(self, sink, branch_name, compression, array, offsets): return fKeylen + fObjlen, fNbytes, location + def write_string_basket(self, sink, branch_name, compression, array, offsets): + fClassName = uproot.serialization.string("TBasket") + fName = uproot.serialization.string(branch_name) + fTitle = uproot.serialization.string(self._name) + fKeylen = ( + uproot.reading._key_format_big.size + + len(fClassName) + + len(fName) + + len(fTitle) + + uproot.models.TBasket._tbasket_format2.size + + 1 + ) + + itemsize = array.dtype.itemsize + for item in array.shape[1:]: + itemsize *= item + try: + uproot.extras.awkward() + except ModuleNotFoundError as err: + raise TypeError( + f"'awkward' cannot be imported: {self._branch_type!r}" + ) from err + + offsets *= itemsize + offsets += fKeylen + + raw_array = uproot._util.tobytes(array) + raw_offsets = uproot._util.tobytes(offsets) + uncompressed_data = ( + raw_array + + _tbasket_offsets_length.pack(len(offsets)) + + raw_offsets[:-4] + + b"\x00\x00\x00\x00" + ) + compressed_data = uproot.compression.compress(uncompressed_data, compression) + + # get size of biggest string + self._metadata["fLen"] = ( + 0 + if len(offsets) == 1 + else max([offsets[i + 1] - offsets[i] for i in range(len(offsets) - 1)]) + ) + + fLast = offsets[-1] + offsets[-1] = 0 + + fObjlen = len(uncompressed_data) + fNbytes = fKeylen + len(compressed_data) + + parent_location = 
self._directory.key.location # FIXME: is this correct? + + location = self._freesegments.allocate(fNbytes, dry_run=False) + + out = [] + out.append( + uproot.reading._key_format_big.pack( + fNbytes, + 1004, # fVersion + fObjlen, + uproot._util.datetime_to_code(datetime.datetime.now()), # fDatime + fKeylen, + 0, # fCycle + location, # fSeekKey + parent_location, # fSeekPdir + ) + ) + out.append(fClassName) + out.append(fName) + out.append(fTitle) + out.append( + uproot.models.TBasket._tbasket_format2.pack( + 3, # fVersion + 32000, # fBufferSize + 1000, # fNevBufSize + len(offsets) - 1, # fNevBuf + fLast, + ) + ) + + out.append(b"\x00") # part of the Key (included in fKeylen, at least) + + out.append(compressed_data) + + sink.write(location, b"".join(out)) + self._freesegments.write(sink) + sink.set_file_length(self._freesegments.fileheader.end) + sink.flush() + + return fKeylen + fObjlen, fNbytes, location + _tbasket_offsets_length = struct.Struct(">I") diff --git a/tests/test_0940-feat-add-TLeafC-string-support.py b/tests/test_0940-feat-add-TLeafC-string-support.py new file mode 100644 index 000000000..ad1375657 --- /dev/null +++ b/tests/test_0940-feat-add-TLeafC-string-support.py @@ -0,0 +1,211 @@ +import pytest +import os +import awkward as ak +import uproot + +ROOT = pytest.importorskip("ROOT") + + +def test_write_tfleac_uproot_1(tmp_path): + filename = os.path.join(tmp_path, "tleafc_test_write_1.root") + + with uproot.recreate(filename) as f: + array = ak.Array(["one", "two", "three"]) + f["tree"] = {"branch": array} + + rf = ROOT.TFile(filename) + data = rf.Get("tree") + assert data.GetLeaf("branch").Class_Name() == "TLeafC" + assert [entry.branch for entry in data] == ["one", "two", "three"] + rf.Close() + + with uproot.open(filename) as g: + assert g["tree"]["branch"].array().tolist() == ["one", "two", "three"] + + +def test_write_tfleac_uproot_2(tmp_path): + filename = os.path.join(tmp_path, "tleafc_test_write_2.root") + + with uproot.recreate(filename) as 
f: + array = ak.Array( + ["unu", "doi", "trei", "patru", "cinci", "sase", "sapte", "opt"] + ) + f["tree"] = {"branch": array} + + rf = ROOT.TFile(filename) + data = rf.Get("tree") + assert data.GetLeaf("branch").Class_Name() == "TLeafC" + assert [entry.branch for entry in data] == [ + "unu", + "doi", + "trei", + "patru", + "cinci", + "sase", + "sapte", + "opt", + ] + rf.Close() + + with uproot.open(filename) as g: + assert g["tree"]["branch"].array().tolist() == [ + "unu", + "doi", + "trei", + "patru", + "cinci", + "sase", + "sapte", + "opt", + ] + + +def test_write_tfleac_uproot_3(tmp_path): + filename = os.path.join(tmp_path, "tleafc_test_write_3.root") + + with uproot.recreate(filename) as f: + array = ak.Array(["zero", "one" * 100, "two", "three" * 100, "four", "five"]) + f["tree"] = {"branch": array} + + rf = ROOT.TFile(filename) + data = rf.Get("tree") + assert data.GetLeaf("branch").Class_Name() == "TLeafC" + assert [entry.branch for entry in data] == [ + "zero", + "one" * 100, + "two", + "three" * 100, + "four", + "five", + ] + rf.Close() + + with uproot.open(filename) as g: + assert g["tree"]["branch"].array().tolist() == [ + "zero", + "one" * 100, + "two", + "three" * 100, + "four", + "five", + ] + + +def test_empty_array(tmp_path): + filename = os.path.join(tmp_path, "empty-array.root") + + with uproot.recreate(filename) as outfile: + array = ak.Array(["one", "two", "three"])[0:0] # type=string but len=0 + outfile["tree"] = {"branch": array} + + root_infile = ROOT.TFile(filename) + root_tree = root_infile.Get("tree") + assert root_tree.GetLeaf("branch").Class_Name() == "TLeafC" + assert [entry.branch for entry in root_tree] == [] + root_infile.Close() + + with uproot.open(filename) as infile: + array = infile["tree"]["branch"].array() + assert array.tolist() == [] + assert str(array.type) == "0 * string" + + +def test_empty_array_2baskets(tmp_path): + filename = os.path.join(tmp_path, "empty-array.root") + + with uproot.recreate(filename) as outfile: + 
array = ak.Array(["one", "two", "three"])[0:0] # type=string but len=0 + outfile["tree"] = {"branch": array} + outfile["tree"].extend({"branch": array}) + + root_infile = ROOT.TFile(filename) + root_tree = root_infile.Get("tree") + assert root_tree.GetLeaf("branch").Class_Name() == "TLeafC" + assert [entry.branch for entry in root_tree] == [] + root_infile.Close() + + with uproot.open(filename) as infile: + array = infile["tree"]["branch"].array() + assert array.tolist() == [] + assert str(array.type) == "0 * string" + + +def test_mutating_fLen(tmp_path): + filename = os.path.join(tmp_path, "mutating-fLen.root") + + with uproot.recreate(filename) as outfile: + aslist = ["x" * 1] + array = ak.Array(aslist) + outfile["tree"] = {"branch": array} + num_baskets = 1 + + with uproot.open(filename) as infile: + assert len(infile.keys()) == 1 + branch = infile["tree"]["branch"] + assert branch.array().tolist() == aslist + assert branch.member("fLeaves")[0].member("fLen") == 2 + assert len(branch.member("fBasketEntry")) == 10 + assert branch.num_baskets == num_baskets + + for i in range(2, 10): + aslist.extend(["x" * i]) + outfile["tree"].extend({"branch": ak.Array(["x" * i])}) + num_baskets += 1 + + with uproot.open(filename) as infile: + assert len(infile.keys()) == 1 + branch = infile["tree"]["branch"] + assert branch.array().tolist() == aslist + # verify that fLen is mutated in-place as we add TBaskets + assert branch.member("fLeaves")[0].member("fLen") == i + 1 + assert len(branch.member("fBasketEntry")) == 10 + assert branch.num_baskets == num_baskets + + for i in range(10, 100): + aslist.extend(["x" * i]) + outfile["tree"].extend({"branch": ak.Array(["x" * i])}) + num_baskets += 1 + + with uproot.open(filename) as infile: + # verify that this is still the case after write_anew + # (increasing fBasketEntry capacity means rewriting metadata) + assert len(infile.keys()) == 1 + branch = infile["tree"]["branch"] + assert branch.array().tolist() == aslist + assert 
branch.member("fLeaves")[0].member("fLen") == i + 1 + assert len(branch.member("fBasketEntry")) == 100 + assert branch.num_baskets == num_baskets + + for i in range(100, 255, 5): + aslist.extend(["x" * i]) + outfile["tree"].extend({"branch": ak.Array(["x" * i])}) + num_baskets += 1 + + with uproot.open(filename) as infile: + # same, but now in the 100 -> 1000 TBasket range + assert len(infile.keys()) == 1 + branch = infile["tree"]["branch"] + assert branch.array().tolist() == aslist + assert branch.member("fLeaves")[0].member("fLen") == i + 1 + assert len(branch.member("fBasketEntry")) == 1000 + assert branch.num_baskets == num_baskets + + for i in range(255, 265, 5): + aslist.extend(["x" * i]) + outfile["tree"].extend({"branch": ak.Array(["x" * i])}) + num_baskets += 1 + + with uproot.open(filename) as infile: + # oh, but once the string length reaches 255 (i.e. >= 255), fLen == i + 5 (1 marker byte, 4 length bytes, i characters) + assert len(infile.keys()) == 1 + branch = infile["tree"]["branch"] + assert branch.array().tolist() == aslist + assert branch.member("fLeaves")[0].member("fLen") == i + 5 + assert len(branch.member("fBasketEntry")) == 1000 + assert branch.num_baskets == num_baskets + + # verify that ROOT is still happy with all of this + root_infile = ROOT.TFile(filename) + root_tree = root_infile.Get("tree") + assert [entry.branch for entry in root_tree] == aslist + root_infile.Close()