Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Model conversion support for T5 and FLAN-T5 model variants #8055

Merged
merged 8 commits into from
Jun 24, 2024
124 changes: 123 additions & 1 deletion convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
if not self.is_safetensors:
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
self.hparams = Model.load_hparams(self.dir_model)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
self.tensor_names = None
if self.ftype == gguf.LlamaFileType.GUESSED:
Expand Down Expand Up @@ -2729,6 +2729,128 @@ def write_tensors(self):
raise ValueError(f"Unprocessed experts: {experts}")


@Model.register("T5ForConditionalGeneration")
@Model.register("T5WithLMHeadModel")
class T5Model(Model):
    """HF -> GGUF converter for T5 / FLAN-T5 encoder-decoder models."""
    model_arch = gguf.MODEL_ARCH.T5

    def set_vocab(self):
        """Build the GGUF vocabulary from the SentencePiece model in 'spiece.model'.

        Reads normalizer options (dummy prefix, extra-whitespace removal,
        precompiled charsmap) directly from the serialized SentencePiece proto,
        then emits token pieces, scores and types, plus any 'added_tokens.json'
        overrides.

        Raises:
            FileNotFoundError: if 'spiece.model' is missing from the model dir.
        """
        # to avoid TypeError: Descriptors cannot be created directly
        # exception when importing sentencepiece_model_pb2
        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
        from sentencepiece import SentencePieceProcessor
        from sentencepiece import sentencepiece_model_pb2 as model

        tokenizer_path = self.dir_model / 'spiece.model'

        if not tokenizer_path.is_file():
            raise FileNotFoundError(f"File not found: {tokenizer_path}")

        sentencepiece_model = model.ModelProto()
        # use a context manager so the file handle is always closed
        with open(tokenizer_path, "rb") as f:
            sentencepiece_model.ParseFromString(f.read())
        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM

        tokenizer = SentencePieceProcessor()
        tokenizer.LoadFromFile(str(tokenizer_path))

        # hparams vocab_size may exceed the tokenizer's (padded embedding matrix)
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

        # pre-fill the whole vocab with [PADn] placeholders; real pieces and
        # added tokens overwrite their slots below
        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
        scores: list[float] = [-10000.0] * vocab_size
        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size

        for token_id in range(tokenizer.vocab_size()):
            piece = tokenizer.IdToPiece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.GetScore(token_id)

            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.IsUnknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.IsControl(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.IsUnused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.IsByte(token_id):
                toktype = SentencePieceTokenTypes.BYTE

            tokens[token_id] = text
            scores[token_id] = score
            toktypes[token_id] = toktype

        added_tokens_file = self.dir_model / 'added_tokens.json'
        if added_tokens_file.is_file():
            with open(added_tokens_file, "r", encoding="utf-8") as f:
                added_tokens_json = json.load(f)
            for key in added_tokens_json:
                token_id = added_tokens_json[key]
                if token_id >= vocab_size:
                    logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                    continue

                tokens[token_id] = key.encode("utf-8")
                scores[token_id] = -1000.0
                toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

        # defensive: tokens was pre-sized to vocab_size above, so this branch
        # should not trigger; kept in case vocab_size semantics change
        if vocab_size > len(tokens):
            pad_count = vocab_size - len(tokens)
            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
            for i in range(1, pad_count + 1):
                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
                scores.append(-1000.0)
                toktypes.append(SentencePieceTokenTypes.UNUSED)

        self.gguf_writer.add_tokenizer_model("t5")
        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
        self.gguf_writer.add_add_space_prefix(add_prefix)
        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
        if precompiled_charsmap:
            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)

        # T5 prepends no BOS; EOS is appended during encoding
        self.gguf_writer.add_add_bos_token(False)
        self.gguf_writer.add_add_eos_token(True)

    def set_gguf_parameters(self):
        """Write T5 hyperparameters (from HF config.json) into the GGUF header."""
        self.gguf_writer.add_name("T5")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
        self.gguf_writer.add_block_count(self.hparams["num_layers"])
        self.gguf_writer.add_head_count(self.hparams["num_heads"])
        self.gguf_writer.add_key_length(self.hparams["d_kv"])
        self.gguf_writer.add_value_length(self.hparams["d_kv"])
        # T5 uses RMS norm; the same epsilon is written under both keys
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
        self.gguf_writer.add_file_type(self.ftype)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

        # Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or
        # "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor
        # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
        if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
            logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
            return []

        return [(self.map_tensor_name(name), data_torch)]


###### CONVERSION LOGIC ######


Expand Down
94 changes: 94 additions & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class LLM:
EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
POOLING_TYPE = "{arch}.pooling_type"
LOGIT_SCALE = "{arch}.logit_scale"
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a specific reason why the decoder_start_token_id isn't with the rest of the tokenizer config (like e.g. tokenizer.ggml.bos_token_id)?

In what way is it different from tokenizer.ggml.bos_token_id? When is it used?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's different. It's not related to the tokenizer at all, it's a model parameter. Decoder start token is not a separate specific token like BOS, EOS or PAD. It's used in encoder-decoder models like T5 as an initial starting token of the autoregressive decoding process. The model creators decided to use one of the existing tokens as the decoder start token (PAD in case of T5) and id of this token is stored in this parameter.


class Attention:
HEAD_COUNT = "{arch}.attention.head_count"
Expand All @@ -62,6 +63,7 @@ class Attention:
CAUSAL = "{arch}.attention.causal"
Q_LORA_RANK = "{arch}.attention.q_lora_rank"
KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"

class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
Expand Down Expand Up @@ -97,6 +99,8 @@ class Tokenizer:
ADD_BOS = "tokenizer.ggml.add_bos_token"
ADD_EOS = "tokenizer.ggml.add_eos_token"
ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
HF_JSON = "tokenizer.huggingface.json"
RWKV = "tokenizer.rwkv.world"
CHAT_TEMPLATE = "tokenizer.chat_template"
Expand Down Expand Up @@ -149,6 +153,7 @@ class MODEL_ARCH(IntEnum):
OLMO = auto()
ARCTIC = auto()
DEEPSEEK2 = auto()
T5 = auto()


class MODEL_TENSOR(IntEnum):
Expand Down Expand Up @@ -200,6 +205,34 @@ class MODEL_TENSOR(IntEnum):
ATTN_KV_B = auto()
ATTN_Q_A_NORM = auto()
ATTN_KV_A_NORM = auto()
DEC_ATTN_NORM = auto()
DEC_ATTN_Q = auto()
DEC_ATTN_K = auto()
DEC_ATTN_V = auto()
DEC_ATTN_OUT = auto()
DEC_ATTN_REL_B = auto()
DEC_CROSS_ATTN_NORM = auto()
DEC_CROSS_ATTN_Q = auto()
DEC_CROSS_ATTN_K = auto()
DEC_CROSS_ATTN_V = auto()
DEC_CROSS_ATTN_OUT = auto()
DEC_CROSS_ATTN_REL_B = auto()
DEC_FFN_NORM = auto()
DEC_FFN_GATE = auto()
DEC_FFN_DOWN = auto()
DEC_FFN_UP = auto()
DEC_OUTPUT_NORM = auto()
ENC_ATTN_NORM = auto()
ENC_ATTN_Q = auto()
ENC_ATTN_K = auto()
ENC_ATTN_V = auto()
ENC_ATTN_OUT = auto()
ENC_ATTN_REL_B = auto()
ENC_FFN_NORM = auto()
ENC_FFN_GATE = auto()
ENC_FFN_DOWN = auto()
ENC_FFN_UP = auto()
ENC_OUTPUT_NORM = auto()


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
Expand Down Expand Up @@ -237,6 +270,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.OLMO: "olmo",
MODEL_ARCH.ARCTIC: "arctic",
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.T5: "t5",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
Expand Down Expand Up @@ -288,6 +322,34 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
Comment on lines +347 to +348
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The enc and dec prefixes will (eventually) need to be also handled by the new markdown output mode of gguf-dump.py (#7853).

Can be fixed in a separate PR, I'm mentioning this for future reference.

(@mofosyne, you should be aware of this)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@compilade I tried it on one example model python3 gguf-py/scripts/gguf-dump.py --markdown /mnt/md0/models/t5-small.gguf and I'm not sure what could be fixed, can you be more specific?

T_ID Tensor Layer Name Human Friendly Tensor Layer Name Elements Shape Type
0 dec.blk.0.attn_k.weight Dec Block 0 Attention Key (W) (~262K) 262144 512 x 512 x 1 x 1 F16
1 dec.blk.0.attn_o.weight Dec Block 0 Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
2 dec.blk.0.attn_q.weight Dec Block 0 Attention Query (W) (~262K) 262144 512 x 512 x 1 x 1 F16
3 dec.blk.0.attn_rel_b.weight Dec Block 0 Attn_Rel_B (W) ( 256) 256 8 x 32 x 1 x 1 F16
4 dec.blk.0.attn_v.weight Dec Block 0 Attention Value (W) (~262K) 262144 512 x 512 x 1 x 1 F16
5 dec.blk.0.attn_norm.weight Dec Block 0 Attention Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
6 dec.blk.0.cross_attn_k.weight Dec Block 0 Cross_Attn_K (W) (~262K) 262144 512 x 512 x 1 x 1 F16
7 dec.blk.0.cross_attn_o.weight Dec Block 0 Cross_Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
8 dec.blk.0.cross_attn_q.weight Dec Block 0 Cross_Attn_Q (W) (~262K) 262144 512 x 512 x 1 x 1 F16
9 dec.blk.0.cross_attn_rel_b.weight Dec Block 0 Cross_Attn_Rel_B (W) ( 256) 256 8 x 32 x 1 x 1 F16
10 dec.blk.0.cross_attn_v.weight Dec Block 0 Cross_Attn_V (W) (~262K) 262144 512 x 512 x 1 x 1 F16
11 dec.blk.0.cross_attn_norm.weight Dec Block 0 Cross_Attn_Norm (W) ( 512) 512 512 x 1 x 1 x 1 F32
12 dec.blk.0.ffn_up.weight Dec Block 0 Feed-Forward Network "Up" (W) ( ~1M) 1048576 512 x 2048 x 1 x 1 F16
13 dec.blk.0.ffn_down.weight Dec Block 0 Feed-Forward Network "Down" (W) ( ~1M) 1048576 2048 x 512 x 1 x 1 F16
14 dec.blk.0.ffn_norm.weight Dec Block 0 Feed-Forward Network Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
15 dec.blk.1.attn_k.weight Dec Block 1 Attention Key (W) (~262K) 262144 512 x 512 x 1 x 1 F16
16 dec.blk.1.attn_o.weight Dec Block 1 Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
17 dec.blk.1.attn_q.weight Dec Block 1 Attention Query (W) (~262K) 262144 512 x 512 x 1 x 1 F16
18 dec.blk.1.attn_v.weight Dec Block 1 Attention Value (W) (~262K) 262144 512 x 512 x 1 x 1 F16
19 dec.blk.1.attn_norm.weight Dec Block 1 Attention Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
20 dec.blk.1.cross_attn_k.weight Dec Block 1 Cross_Attn_K (W) (~262K) 262144 512 x 512 x 1 x 1 F16
21 dec.blk.1.cross_attn_o.weight Dec Block 1 Cross_Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
22 dec.blk.1.cross_attn_q.weight Dec Block 1 Cross_Attn_Q (W) (~262K) 262144 512 x 512 x 1 x 1 F16
23 dec.blk.1.cross_attn_v.weight Dec Block 1 Cross_Attn_V (W) (~262K) 262144 512 x 512 x 1 x 1 F16
24 dec.blk.1.cross_attn_norm.weight Dec Block 1 Cross_Attn_Norm (W) ( 512) 512 512 x 1 x 1 x 1 F32
25 dec.blk.1.ffn_up.weight Dec Block 1 Feed-Forward Network "Up" (W) ( ~1M) 1048576 512 x 2048 x 1 x 1 F16
26 dec.blk.1.ffn_down.weight Dec Block 1 Feed-Forward Network "Down" (W) ( ~1M) 1048576 2048 x 512 x 1 x 1 F16
27 dec.blk.1.ffn_norm.weight Dec Block 1 Feed-Forward Network Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
28 dec.blk.2.attn_k.weight Dec Block 2 Attention Key (W) (~262K) 262144 512 x 512 x 1 x 1 F16
29 dec.blk.2.attn_o.weight Dec Block 2 Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
30 dec.blk.2.attn_q.weight Dec Block 2 Attention Query (W) (~262K) 262144 512 x 512 x 1 x 1 F16
31 dec.blk.2.attn_v.weight Dec Block 2 Attention Value (W) (~262K) 262144 512 x 512 x 1 x 1 F16
32 dec.blk.2.attn_norm.weight Dec Block 2 Attention Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
33 dec.blk.2.cross_attn_k.weight Dec Block 2 Cross_Attn_K (W) (~262K) 262144 512 x 512 x 1 x 1 F16
34 dec.blk.2.cross_attn_o.weight Dec Block 2 Cross_Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
35 dec.blk.2.cross_attn_q.weight Dec Block 2 Cross_Attn_Q (W) (~262K) 262144 512 x 512 x 1 x 1 F16
36 dec.blk.2.cross_attn_v.weight Dec Block 2 Cross_Attn_V (W) (~262K) 262144 512 x 512 x 1 x 1 F16
37 dec.blk.2.cross_attn_norm.weight Dec Block 2 Cross_Attn_Norm (W) ( 512) 512 512 x 1 x 1 x 1 F32
38 dec.blk.2.ffn_up.weight Dec Block 2 Feed-Forward Network "Up" (W) ( ~1M) 1048576 512 x 2048 x 1 x 1 F16
39 dec.blk.2.ffn_down.weight Dec Block 2 Feed-Forward Network "Down" (W) ( ~1M) 1048576 2048 x 512 x 1 x 1 F16
40 dec.blk.2.ffn_norm.weight Dec Block 2 Feed-Forward Network Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
41 dec.blk.3.attn_k.weight Dec Block 3 Attention Key (W) (~262K) 262144 512 x 512 x 1 x 1 F16
42 dec.blk.3.attn_o.weight Dec Block 3 Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
43 dec.blk.3.attn_q.weight Dec Block 3 Attention Query (W) (~262K) 262144 512 x 512 x 1 x 1 F16
44 dec.blk.3.attn_v.weight Dec Block 3 Attention Value (W) (~262K) 262144 512 x 512 x 1 x 1 F16
45 dec.blk.3.attn_norm.weight Dec Block 3 Attention Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
46 dec.blk.3.cross_attn_k.weight Dec Block 3 Cross_Attn_K (W) (~262K) 262144 512 x 512 x 1 x 1 F16
47 dec.blk.3.cross_attn_o.weight Dec Block 3 Cross_Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
48 dec.blk.3.cross_attn_q.weight Dec Block 3 Cross_Attn_Q (W) (~262K) 262144 512 x 512 x 1 x 1 F16
49 dec.blk.3.cross_attn_v.weight Dec Block 3 Cross_Attn_V (W) (~262K) 262144 512 x 512 x 1 x 1 F16
50 dec.blk.3.cross_attn_norm.weight Dec Block 3 Cross_Attn_Norm (W) ( 512) 512 512 x 1 x 1 x 1 F32
51 dec.blk.3.ffn_up.weight Dec Block 3 Feed-Forward Network "Up" (W) ( ~1M) 1048576 512 x 2048 x 1 x 1 F16
52 dec.blk.3.ffn_down.weight Dec Block 3 Feed-Forward Network "Down" (W) ( ~1M) 1048576 2048 x 512 x 1 x 1 F16
53 dec.blk.3.ffn_norm.weight Dec Block 3 Feed-Forward Network Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
54 dec.blk.4.attn_k.weight Dec Block 4 Attention Key (W) (~262K) 262144 512 x 512 x 1 x 1 F16
55 dec.blk.4.attn_o.weight Dec Block 4 Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
56 dec.blk.4.attn_q.weight Dec Block 4 Attention Query (W) (~262K) 262144 512 x 512 x 1 x 1 F16
57 dec.blk.4.attn_v.weight Dec Block 4 Attention Value (W) (~262K) 262144 512 x 512 x 1 x 1 F16
58 dec.blk.4.attn_norm.weight Dec Block 4 Attention Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
59 dec.blk.4.cross_attn_k.weight Dec Block 4 Cross_Attn_K (W) (~262K) 262144 512 x 512 x 1 x 1 F16
60 dec.blk.4.cross_attn_o.weight Dec Block 4 Cross_Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
61 dec.blk.4.cross_attn_q.weight Dec Block 4 Cross_Attn_Q (W) (~262K) 262144 512 x 512 x 1 x 1 F16
62 dec.blk.4.cross_attn_v.weight Dec Block 4 Cross_Attn_V (W) (~262K) 262144 512 x 512 x 1 x 1 F16
63 dec.blk.4.cross_attn_norm.weight Dec Block 4 Cross_Attn_Norm (W) ( 512) 512 512 x 1 x 1 x 1 F32
64 dec.blk.4.ffn_up.weight Dec Block 4 Feed-Forward Network "Up" (W) ( ~1M) 1048576 512 x 2048 x 1 x 1 F16
65 dec.blk.4.ffn_down.weight Dec Block 4 Feed-Forward Network "Down" (W) ( ~1M) 1048576 2048 x 512 x 1 x 1 F16
66 dec.blk.4.ffn_norm.weight Dec Block 4 Feed-Forward Network Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
67 dec.blk.5.attn_k.weight Dec Block 5 Attention Key (W) (~262K) 262144 512 x 512 x 1 x 1 F16
68 dec.blk.5.attn_o.weight Dec Block 5 Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
69 dec.blk.5.attn_q.weight Dec Block 5 Attention Query (W) (~262K) 262144 512 x 512 x 1 x 1 F16
70 dec.blk.5.attn_v.weight Dec Block 5 Attention Value (W) (~262K) 262144 512 x 512 x 1 x 1 F16
71 dec.blk.5.attn_norm.weight Dec Block 5 Attention Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
72 dec.blk.5.cross_attn_k.weight Dec Block 5 Cross_Attn_K (W) (~262K) 262144 512 x 512 x 1 x 1 F16
73 dec.blk.5.cross_attn_o.weight Dec Block 5 Cross_Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
74 dec.blk.5.cross_attn_q.weight Dec Block 5 Cross_Attn_Q (W) (~262K) 262144 512 x 512 x 1 x 1 F16
75 dec.blk.5.cross_attn_v.weight Dec Block 5 Cross_Attn_V (W) (~262K) 262144 512 x 512 x 1 x 1 F16
76 dec.blk.5.cross_attn_norm.weight Dec Block 5 Cross_Attn_Norm (W) ( 512) 512 512 x 1 x 1 x 1 F32
77 dec.blk.5.ffn_up.weight Dec Block 5 Feed-Forward Network "Up" (W) ( ~1M) 1048576 512 x 2048 x 1 x 1 F16
78 dec.blk.5.ffn_down.weight Dec Block 5 Feed-Forward Network "Down" (W) ( ~1M) 1048576 2048 x 512 x 1 x 1 F16
79 dec.blk.5.ffn_norm.weight Dec Block 5 Feed-Forward Network Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
80 dec.output_norm.weight Dec Output Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
81 enc.blk.0.attn_k.weight Enc Block 0 Attention Key (W) (~262K) 262144 512 x 512 x 1 x 1 F16
82 enc.blk.0.attn_o.weight Enc Block 0 Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
83 enc.blk.0.attn_q.weight Enc Block 0 Attention Query (W) (~262K) 262144 512 x 512 x 1 x 1 F16
84 enc.blk.0.attn_rel_b.weight Enc Block 0 Attn_Rel_B (W) ( 256) 256 8 x 32 x 1 x 1 F16
85 enc.blk.0.attn_v.weight Enc Block 0 Attention Value (W) (~262K) 262144 512 x 512 x 1 x 1 F16
86 enc.blk.0.attn_norm.weight Enc Block 0 Attention Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
87 enc.blk.0.ffn_up.weight Enc Block 0 Feed-Forward Network "Up" (W) ( ~1M) 1048576 512 x 2048 x 1 x 1 F16
88 enc.blk.0.ffn_down.weight Enc Block 0 Feed-Forward Network "Down" (W) ( ~1M) 1048576 2048 x 512 x 1 x 1 F16
89 enc.blk.0.ffn_norm.weight Enc Block 0 Feed-Forward Network Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
90 enc.blk.1.attn_k.weight Enc Block 1 Attention Key (W) (~262K) 262144 512 x 512 x 1 x 1 F16
91 enc.blk.1.attn_o.weight Enc Block 1 Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
92 enc.blk.1.attn_q.weight Enc Block 1 Attention Query (W) (~262K) 262144 512 x 512 x 1 x 1 F16
93 enc.blk.1.attn_v.weight Enc Block 1 Attention Value (W) (~262K) 262144 512 x 512 x 1 x 1 F16
94 enc.blk.1.attn_norm.weight Enc Block 1 Attention Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
95 enc.blk.1.ffn_up.weight Enc Block 1 Feed-Forward Network "Up" (W) ( ~1M) 1048576 512 x 2048 x 1 x 1 F16
96 enc.blk.1.ffn_down.weight Enc Block 1 Feed-Forward Network "Down" (W) ( ~1M) 1048576 2048 x 512 x 1 x 1 F16
97 enc.blk.1.ffn_norm.weight Enc Block 1 Feed-Forward Network Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
98 enc.blk.2.attn_k.weight Enc Block 2 Attention Key (W) (~262K) 262144 512 x 512 x 1 x 1 F16
99 enc.blk.2.attn_o.weight Enc Block 2 Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
100 enc.blk.2.attn_q.weight Enc Block 2 Attention Query (W) (~262K) 262144 512 x 512 x 1 x 1 F16
101 enc.blk.2.attn_v.weight Enc Block 2 Attention Value (W) (~262K) 262144 512 x 512 x 1 x 1 F16
102 enc.blk.2.attn_norm.weight Enc Block 2 Attention Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
103 enc.blk.2.ffn_up.weight Enc Block 2 Feed-Forward Network "Up" (W) ( ~1M) 1048576 512 x 2048 x 1 x 1 F16
104 enc.blk.2.ffn_down.weight Enc Block 2 Feed-Forward Network "Down" (W) ( ~1M) 1048576 2048 x 512 x 1 x 1 F16
105 enc.blk.2.ffn_norm.weight Enc Block 2 Feed-Forward Network Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
106 enc.blk.3.attn_k.weight Enc Block 3 Attention Key (W) (~262K) 262144 512 x 512 x 1 x 1 F16
107 enc.blk.3.attn_o.weight Enc Block 3 Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
108 enc.blk.3.attn_q.weight Enc Block 3 Attention Query (W) (~262K) 262144 512 x 512 x 1 x 1 F16
109 enc.blk.3.attn_v.weight Enc Block 3 Attention Value (W) (~262K) 262144 512 x 512 x 1 x 1 F16
110 enc.blk.3.attn_norm.weight Enc Block 3 Attention Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
111 enc.blk.3.ffn_up.weight Enc Block 3 Feed-Forward Network "Up" (W) ( ~1M) 1048576 512 x 2048 x 1 x 1 F16
112 enc.blk.3.ffn_down.weight Enc Block 3 Feed-Forward Network "Down" (W) ( ~1M) 1048576 2048 x 512 x 1 x 1 F16
113 enc.blk.3.ffn_norm.weight Enc Block 3 Feed-Forward Network Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
114 enc.blk.4.attn_k.weight Enc Block 4 Attention Key (W) (~262K) 262144 512 x 512 x 1 x 1 F16
115 enc.blk.4.attn_o.weight Enc Block 4 Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
116 enc.blk.4.attn_q.weight Enc Block 4 Attention Query (W) (~262K) 262144 512 x 512 x 1 x 1 F16
117 enc.blk.4.attn_v.weight Enc Block 4 Attention Value (W) (~262K) 262144 512 x 512 x 1 x 1 F16
118 enc.blk.4.attn_norm.weight Enc Block 4 Attention Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
119 enc.blk.4.ffn_up.weight Enc Block 4 Feed-Forward Network "Up" (W) ( ~1M) 1048576 512 x 2048 x 1 x 1 F16
120 enc.blk.4.ffn_down.weight Enc Block 4 Feed-Forward Network "Down" (W) ( ~1M) 1048576 2048 x 512 x 1 x 1 F16
121 enc.blk.4.ffn_norm.weight Enc Block 4 Feed-Forward Network Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
122 enc.blk.5.attn_k.weight Enc Block 5 Attention Key (W) (~262K) 262144 512 x 512 x 1 x 1 F16
123 enc.blk.5.attn_o.weight Enc Block 5 Attn_O (W) (~262K) 262144 512 x 512 x 1 x 1 F16
124 enc.blk.5.attn_q.weight Enc Block 5 Attention Query (W) (~262K) 262144 512 x 512 x 1 x 1 F16
125 enc.blk.5.attn_v.weight Enc Block 5 Attention Value (W) (~262K) 262144 512 x 512 x 1 x 1 F16
126 enc.blk.5.attn_norm.weight Enc Block 5 Attention Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
127 enc.blk.5.ffn_up.weight Enc Block 5 Feed-Forward Network "Up" (W) ( ~1M) 1048576 512 x 2048 x 1 x 1 F16
128 enc.blk.5.ffn_down.weight Enc Block 5 Feed-Forward Network "Down" (W) ( ~1M) 1048576 2048 x 512 x 1 x 1 F16
129 enc.blk.5.ffn_norm.weight Enc Block 5 Feed-Forward Network Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
130 enc.output_norm.weight Enc Output Normalization (W) ( 512) 512 512 x 1 x 1 x 1 F32
131 token_embd.weight Token Embedding (W) ( ~16M) 16449536 512 x 32128 x 1 x 1 F16

Copy link
Collaborator

@compilade compilade Jun 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the markdown output of gguf-dump.py, there's currently a special case for tensor names which don't start with blk (ref: #7853 (comment), it seemed reasonable at the time), and it puts them all in the same section (so that token_embd.weight is in the same section as output.weight). If you try it on a non-T5 model (e.g. tinyllama or something), you'll notice that there are sections for each layer number.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in #8090

MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
}

MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
Expand Down Expand Up @@ -808,6 +870,38 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
],
MODEL_ARCH.T5: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.DEC_ATTN_NORM,
MODEL_TENSOR.DEC_ATTN_Q,
MODEL_TENSOR.DEC_ATTN_K,
MODEL_TENSOR.DEC_ATTN_V,
MODEL_TENSOR.DEC_ATTN_OUT,
MODEL_TENSOR.DEC_ATTN_REL_B,
MODEL_TENSOR.DEC_CROSS_ATTN_NORM,
MODEL_TENSOR.DEC_CROSS_ATTN_Q,
MODEL_TENSOR.DEC_CROSS_ATTN_K,
MODEL_TENSOR.DEC_CROSS_ATTN_V,
MODEL_TENSOR.DEC_CROSS_ATTN_OUT,
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B,
MODEL_TENSOR.DEC_FFN_NORM,
MODEL_TENSOR.DEC_FFN_GATE,
MODEL_TENSOR.DEC_FFN_DOWN,
MODEL_TENSOR.DEC_FFN_UP,
MODEL_TENSOR.DEC_OUTPUT_NORM,
MODEL_TENSOR.ENC_ATTN_NORM,
MODEL_TENSOR.ENC_ATTN_Q,
MODEL_TENSOR.ENC_ATTN_K,
MODEL_TENSOR.ENC_ATTN_V,
MODEL_TENSOR.ENC_ATTN_OUT,
MODEL_TENSOR.ENC_ATTN_REL_B,
MODEL_TENSOR.ENC_FFN_NORM,
MODEL_TENSOR.ENC_FFN_GATE,
MODEL_TENSOR.ENC_FFN_DOWN,
MODEL_TENSOR.ENC_FFN_UP,
MODEL_TENSOR.ENC_OUTPUT_NORM,
],
# TODO
}

Expand Down
21 changes: 18 additions & 3 deletions gguf-py/gguf/gguf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,9 @@ def add_expert_shared_feed_forward_length(self, length: int) -> None:
def add_parallel_residual(self, use: bool) -> None:
    """Write the '{arch}.use_parallel_residual' boolean key to the GGUF metadata."""
    self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

def add_decoder_start_token_id(self, id: int) -> None:
    """Write the '{arch}.decoder_start_token_id' key to the GGUF metadata.

    A model parameter (not a tokenizer one): encoder-decoder models such as
    T5 use this token id to seed the autoregressive decoding process.
    NOTE(review): parameter name `id` shadows the builtin; kept as-is so
    keyword callers are not broken.
    """
    self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)

def add_head_count(self, count: int) -> None:
    """Write the '{arch}.attention.head_count' key to the GGUF metadata."""
    self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)

Expand Down Expand Up @@ -448,6 +451,9 @@ def add_q_lora_rank(self, length: int) -> None:
def add_kv_lora_rank(self, length: int) -> None:
    """Write the '{arch}.attention.kv_lora_rank' key to the GGUF metadata."""
    self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)

def add_relative_attn_buckets_count(self, value: int) -> None:
    """Write the '{arch}.attention.relative_buckets_count' key (relative
    position bucket count used by T5-style relative attention bias)."""
    self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)

def add_pooling_type(self, value: PoolingType) -> None:
    """Write the '{arch}.pooling_type' key, storing the enum's integer value."""
    self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)

Expand Down Expand Up @@ -538,6 +544,12 @@ def add_add_eos_token(self, value: bool) -> None:
def add_add_space_prefix(self, value: bool) -> None:
    """Write 'tokenizer.ggml.add_space_prefix' (SentencePiece add_dummy_prefix)."""
    self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)

def add_remove_extra_whitespaces(self, value: bool) -> None:
    """Write 'tokenizer.ggml.remove_extra_whitespaces' (SentencePiece
    normalizer remove_extra_whitespaces option)."""
    self.add_bool(Keys.Tokenizer.REMOVE_EXTRA_WS, value)

def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
    """Write 'tokenizer.ggml.precompiled_charsmap' (the serialized
    SentencePiece normalizer charsmap) as a GGUF array."""
    self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)

def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
if not isinstance(value, str):
template_default = None
Expand Down Expand Up @@ -599,9 +611,12 @@ def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes:
kv_data += self._pack("Q", len(encoded_val))
kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
ltype = GGUFValueType.get_type(val[0])
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
raise ValueError("All items in a GGUF array should be of the same type")
if isinstance(val, bytes):
ltype = GGUFValueType.UINT8
else:
ltype = GGUFValueType.get_type(val[0])
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
raise ValueError("All items in a GGUF array should be of the same type")
kv_data += self._pack("I", ltype)
kv_data += self._pack("Q", len(val))
for item in val:
Expand Down
Loading
Loading