create_bert.py
# Copyright 2022 MosaicML Examples authors
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import os
import sys
from typing import Optional

# Add the src folder root to the path so relative imports work regardless of
# the directory the script is run from.
sys.path.append(os.path.dirname(os.path.realpath(__file__)))

import bert_layers as bert_layers_module
import configuration_bert as configuration_bert_module
import transformers
from composer.models.huggingface import HuggingFaceModel

__all__ = ['create_bert_mlm']
def create_bert_mlm(pretrained_model_name: str = 'bert-base-uncased',
                    model_config: Optional[dict] = None,
                    tokenizer_name: Optional[str] = None):
    """BERT masked language model based on |:hugging_face:| Transformers.

    For more information, see the `Transformers documentation
    <https://huggingface.co/transformers/>`_ and `Mosaic's BERT repo
    <https://github.com/mosaicml/examples/tree/main/examples/benchmarks/bert>`_.

    Args:
        pretrained_model_name (str): Name of the Hugging Face model to
            instantiate. This will determine the default model configuration.
            Default: ``bert-base-uncased``.
        model_config (dict): A dictionary of user-specified configurations to
            update/add to the default model configuration.
        tokenizer_name (str, optional): Tokenizer name used to preprocess the
            dataset and validate the model's inputs.
        gradient_checkpointing (bool, optional): Use gradient checkpointing.
            Default: ``False``.
        pretrained_checkpoint (str, optional): The pretrained checkpoint to
            initialize the model weights. If provided, the state dictionary
            stored at `pretrained_checkpoint` will be loaded into the model
            after initialization. Default: ``None``.

    The default configuration for ``bert-base-uncased`` looks like:

    .. code-block::

        {
          "_name_or_path": "bert-base-uncased",
          "alibi_starting_size": 512,
          "architectures": ["BertForMaskedLM"],
          "attention_probs_dropout_prob": 0.0,
          "classifier_dropout": null,
          "gradient_checkpointing": false,
          "hidden_act": "gelu",
          "hidden_dropout_prob": 0.1,
          "hidden_size": 768,
          "initializer_range": 0.02,
          "intermediate_size": 3072,
          "layer_norm_eps": 1e-12,
          "max_position_embeddings": 512,
          "model_type": "bert",
          "num_attention_heads": 12,
          "num_hidden_layers": 12,
          "pad_token_id": 0,
          "position_embedding_type": "absolute",
          "transformers_version": "4.16.0",
          "type_vocab_size": 2,
          "use_cache": true,
          "vocab_size": 30522
        }
    """
    if not model_config:
        model_config = {}

    if not pretrained_model_name:
        pretrained_model_name = 'bert-base-uncased'

    config = configuration_bert_module.BertConfig.from_pretrained(
        pretrained_model_name, **model_config)

    # Apply any user-specified overrides on top of the pretrained config.
    for key, value in model_config.items():
        config.update({f'{key}': value})

    # Pad the vocabulary for divisibility by 8 (more efficient GPU kernels).
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    model = bert_layers_module.BertForMaskedLM(config)

    # Set up the tokenizer.
    if tokenizer_name:
        tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)
    else:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            pretrained_model_name)

    hf_model = HuggingFaceModel(model=model,
                                tokenizer=tokenizer,
                                use_logits=True)

    # Pad for divisibility by 8 again; we have to redo it here because
    # wrapping with HuggingFaceModel changes the vocab size.
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)
    hf_model.model.resize_token_embeddings(config.vocab_size)

    return hf_model
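

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file): a minimal, hedged example of
# how create_bert_mlm above might be called. It assumes bert_layers,
# configuration_bert, transformers, and composer are importable and that the
# 'bert-base-uncased' weights and tokenizer can be downloaded.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # Build the Composer-wrapped masked-LM with a small config override.
    # 'alibi_starting_size' is taken from the default config shown in the
    # docstring; other keys from that config could be overridden the same way.
    hf_model = create_bert_mlm(
        pretrained_model_name='bert-base-uncased',
        model_config={'alibi_starting_size': 1024},
        tokenizer_name='bert-base-uncased',
    )

    # The returned object is a composer HuggingFaceModel; the underlying
    # module and tokenizer are accessible as attributes.
    print(type(hf_model).__name__)           # HuggingFaceModel
    print(hf_model.model.config.vocab_size)  # padded up to a multiple of 8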