# pfn.py

import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AlbertTokenizer, AlbertModel, AutoModelForPreTraining 
from transformers import CanineTokenizer, CanineModel   # Comment in when Environment 1 is used.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Keep the next 3 lines enabled (uncommented) only when Environment 2 is used.
sys.path.append('../character-bert/')
from utils.character_cnn import CharacterIndexer
from modeling.character_bert import CharacterBertModel
####

def cumsoftmax(x):
    """Return the cumulative sum of a softmax taken over the last dimension.

    Both the softmax and the running sum are computed along dimension -1,
    so the output has the same shape as ``x``.
    """
    probabilities = F.softmax(x, dim=-1)
    return probabilities.cumsum(dim=-1)


class LinearDropConnect(nn.Linear):
    """Linear layer whose weight matrix is randomly masked (DropConnect).

    In training mode the layer multiplies the input by a cached, masked copy
    of the weight (``self._weight``) that is refreshed by ``sample_mask()``.
    In evaluation mode the full weight is used, scaled by ``1 - dropout``.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        bias: whether the layer has an additive bias.
        dropout: probability with which each weight entry is zeroed when a
            mask is sampled.
    """

    def __init__(self, in_features, out_features, bias=True, dropout=0.):
        super(LinearDropConnect, self).__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias
        )
        self.dropout = dropout

    def sample_mask(self):
        """Draw a new weight mask and cache the masked weight in ``_weight``."""
        if self.dropout == 0.:
            # Nothing to drop: the effective weight is the weight itself.
            self._weight = self.weight
        else:
            mask = self.weight.new_empty(
                self.weight.size(),
                dtype=torch.bool
            )
            # True entries (drawn with probability `dropout`) are zeroed.
            mask.bernoulli_(self.dropout)
            self._weight = self.weight.masked_fill(mask, 0.)

    def forward(self, input, sample_mask=False):
        """Apply the layer.

        Args:
            input: tensor accepted by ``F.linear``.
            sample_mask: if True (training mode only), draw a fresh mask
                before applying the layer.

        Returns:
            The linear transform of ``input``.
        """
        if self.training:
            # Sample lazily when no mask has been cached yet, so the layer
            # can be called in training mode without a prior explicit
            # ``sample_mask()`` (this used to raise AttributeError).
            if sample_mask or not hasattr(self, '_weight'):
                self.sample_mask()
            return F.linear(input, self._weight, self.bias)
        # Evaluation: use the expected weight instead of a sampled mask.
        return F.linear(input, self.weight * (1 - self.dropout), self.bias)


class pfn_unit(nn.Module):
    """One recurrent step of the partition filter encoder.

    The step projects the input and the previous hidden state into five
    equally sized chunks: a candidate cell plus an entity gate and a relation
    gate for both the previous cell and the candidate cell. The gates split
    each cell into a shared part, a relation-only part and an entity-only
    part, which are recombined into task-specific memories.
    """

    def __init__(self, args, input_size):
        super().__init__()
        self.args = args
        hidden = args.hidden_size

        # Creation order of the layers is kept as-is so that seeded
        # initialisation stays reproducible.
        self.hidden_transform = LinearDropConnect(
            hidden, 5 * hidden, bias=True, dropout=args.dropconnect)
        self.input_transform = nn.Linear(input_size, 5 * hidden, bias=True)

        self.transform = nn.Linear(3 * hidden, hidden)
        self.drop_weight_modules = [self.hidden_transform]

    def sample_masks(self):
        """Resample the DropConnect mask of every registered module."""
        for module in self.drop_weight_modules:
            module.sample_mask()

    @staticmethod
    def _partition(relation_gate, entity_gate):
        """Return the (shared, relation-only, entity-only) partition weights."""
        shared = relation_gate * entity_gate
        return shared, relation_gate - shared, entity_gate - shared

    def forward(self, x, hidden):
        """Advance the recurrence by one step.

        Args:
            x: input for the current step; chunked along dimension 1 after
                projection.
            hidden: tuple ``(h, c)`` holding the previous hidden and cell
                states.

        Returns:
            ``((h_out, c_out), (h_ner, h_re, h_share))``: the next recurrent
            state and the three task-specific hidden vectors.
        """
        prev_h, prev_c = hidden

        projected = self.input_transform(x) + self.hidden_transform(prev_h)
        candidate, ent_prev, rel_prev, ent_cand, rel_cand = torch.chunk(
            projected[:, :], 5, dim=1)

        # Entity gates are reversed cumulative softmaxes, relation gates are
        # plain cumulative softmaxes.
        ent_prev = 1 - cumsoftmax(ent_prev)
        rel_prev = cumsoftmax(rel_prev)

        ent_cand = 1 - cumsoftmax(ent_cand)
        rel_cand = cumsoftmax(rel_cand)

        candidate = torch.tanh(candidate)

        shared_cand, rel_only_cand, ent_only_cand = self._partition(rel_cand, ent_cand)
        shared_prev, rel_only_prev, ent_only_prev = self._partition(rel_prev, ent_prev)

        c_share = shared_prev * prev_c + shared_cand * candidate
        c_re = rel_only_prev * prev_c + rel_only_cand * candidate + c_share
        c_ner = ent_only_prev * prev_c + ent_only_cand * candidate + c_share

        h_re = torch.tanh(c_re)
        h_ner = torch.tanh(c_ner)
        h_share = torch.tanh(c_share)

        # The next cell state mixes all three memories through a linear map.
        c_out = self.transform(torch.cat((c_re, c_ner, c_share), dim=-1))
        h_out = torch.tanh(c_out)

        return (h_out, c_out), (h_ner, h_re, h_share)


class encoder(nn.Module):
    """Unrolls a ``pfn_unit`` over the first dimension of the input."""

    def __init__(self, args, input_size):
        super(encoder, self).__init__()
        self.args = args
        self.unit = pfn_unit(args, input_size)

    def hidden_init(self, batch_size, target_device=None):
        """Return zero-initialised ``(h0, c0)`` of shape (batch_size, hidden_size).

        Args:
            batch_size: number of rows in each state tensor.
            target_device: device on which to allocate the states. When
                omitted, the module-level ``device`` is used, which matches
                the previous behaviour.
        """
        if target_device is None:
            target_device = device
        shape = (batch_size, self.args.hidden_size)
        # Allocate directly on the target device rather than on the CPU
        # followed by a copy; torch.zeros does not require grad by default.
        h0 = torch.zeros(shape, device=target_device)
        c0 = torch.zeros(shape, device=target_device)
        return (h0, c0)

    def forward(self, x):
        """Encode a sequence.

        Args:
            x: 3-D tensor indexed as ``x[t, :, :]`` per step, i.e. the first
                dimension is the step axis and the second the batch axis.

        Returns:
            ``(h_ner, h_re, h_share)``, each the per-step outputs of the unit
            stacked along a new leading dimension.
        """
        seq_len = x.size(0)
        batch_size = x.size(1)
        h_ner, h_re, h_share = [], [], []
        # Create the initial state where the input lives so the recurrence
        # works even if the model is not on the module-level default device.
        hidden = self.hidden_init(batch_size, x.device)

        if self.training:
            # One DropConnect mask is shared by all steps of this forward pass.
            self.unit.sample_masks()

        for t in range(seq_len):
            hidden, (ner_t, re_t, share_t) = self.unit(x[t, :, :], hidden)
            h_ner.append(ner_t)
            h_re.append(re_t)
            h_share.append(share_t)

        h_ner = torch.stack(h_ner, dim=0)
        h_re = torch.stack(h_re, dim=0)
        h_share = torch.stack(h_share, dim=0)

        return h_ner, h_re, h_share


class ner_unit(nn.Module):
    """Scores every (start, end) position pair for every entity type.

    Args:
        args: configuration object providing ``hidden_size`` and ``dropout``.
        ner2idx: mapping of entity labels to indices; only its length is
            used, to size the output layer.
    """

    def __init__(self, args, ner2idx):
        super(ner_unit, self).__init__()
        self.hidden_size = args.hidden_size
        self.ner2idx = ner2idx

        self.hid2hid = nn.Linear(self.hidden_size * 3, self.hidden_size)
        self.hid2tag = nn.Linear(self.hidden_size, len(ner2idx))

        self.elu = nn.ELU()
        self.n = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.ln = nn.LayerNorm(self.hidden_size)

        self.dropout = nn.Dropout(args.dropout)

    def forward(self, h_ner, h_share, mask):
        """Compute the entity score table.

        Args:
            h_ner: entity features of shape (length, batch, hidden).
            h_share: shared features with the same shape as ``h_ner``.
            mask: (length, batch) tensor; zeros mark positions to ignore.

        Returns:
            Tensor of shape (length, length, batch, len(ner2idx)) with
            sigmoid scores. Entry ``[i, j]`` pairs start position ``i`` with
            end position ``j``; it is zeroed when ``j`` is before ``i`` or
            when either position is masked out.
        """
        length, _, _ = h_ner.size()

        # Sequence-level context: max over positions of a joint projection.
        h_global = torch.cat((h_share, h_ner), dim=-1)
        h_global = torch.tanh(self.n(h_global))
        h_global = torch.max(h_global, dim=0)[0]
        h_global = h_global.unsqueeze(0).repeat(length, 1, 1)
        h_global = h_global.unsqueeze(0).repeat(length, 1, 1, 1)

        st = h_ner.unsqueeze(1).repeat(1, length, 1, 1)
        en = h_ner.unsqueeze(0).repeat(length, 1, 1, 1)

        ner = torch.cat((st, en, h_global), dim=-1)

        ner = self.ln(self.hid2hid(ner))
        ner = self.elu(self.dropout(ner))
        ner = torch.sigmoid(self.hid2tag(ner))

        # Upper-triangular (end not before start) mask, created once per call
        # directly on the device of the features and broadcast over the batch
        # instead of being built per batch element on the CPU and copied.
        upper = torch.triu(torch.ones(length, length, device=h_ner.device))

        # valid[i, j, b] = mask[i, b] * mask[j, b], via broadcasting.
        valid = mask.unsqueeze(1) * mask.unsqueeze(0)
        span_mask = upper.unsqueeze(-1) * valid

        # The trailing singleton dimension broadcasts over entity types.
        return ner * span_mask.unsqueeze(-1)


class re_unit(nn.Module):
    """Scores every ordered pair of positions for every relation type.

    Args:
        args: configuration object providing ``hidden_size`` and ``dropout``.
        re2idx: mapping of relation labels to indices; its length sizes the
            output layer.
    """

    def __init__(self, args, re2idx):
        super().__init__()
        hidden = args.hidden_size
        self.hidden_size = hidden
        self.relation_size = len(re2idx)
        self.re2idx = re2idx

        # Layer creation order is unchanged so seeded initialisation matches.
        self.hid2hid = nn.Linear(3 * hidden, hidden)
        self.hid2rel = nn.Linear(hidden, self.relation_size)
        self.elu = nn.ELU()

        self.r = nn.Linear(2 * hidden, hidden)
        self.ln = nn.LayerNorm(hidden)

        self.dropout = nn.Dropout(args.dropout)

    def forward(self, h_re, h_share, mask):
        """Compute the relation score table.

        Args:
            h_re: relation features of shape (length, batch, hidden).
            h_share: shared features with the same shape as ``h_re``.
            mask: (length, batch) tensor; zeros mark positions to ignore.

        Returns:
            Tensor of shape (length, length, batch, relation_size) holding
            sigmoid scores, zeroed wherever either position of the pair is
            masked out.
        """
        seq_len, _, _ = h_re.size()

        # Sequence-level context: max over positions of a joint projection,
        # then tiled to one copy per position pair.
        context = torch.tanh(self.r(torch.cat((h_share, h_re), dim=-1)))
        context = context.max(dim=0).values
        context = context.unsqueeze(0).repeat(seq_len, 1, 1)
        context = context.unsqueeze(0).repeat(seq_len, 1, 1, 1)

        first = h_re.unsqueeze(1).repeat(1, seq_len, 1, 1)
        second = h_re.unsqueeze(0).repeat(seq_len, 1, 1, 1)

        scores = torch.cat((first, second, context), dim=-1)
        scores = self.hid2hid(scores)
        scores = self.ln(scores)
        scores = self.dropout(scores)
        scores = self.elu(scores)
        scores = torch.sigmoid(self.hid2rel(scores))

        # pair_mask[i, j, b] = mask[i, b] * mask[j, b]; the trailing singleton
        # axis broadcasts the same value across all relation types.
        pair_mask = mask.unsqueeze(1) * mask.unsqueeze(0)

        return scores * pair_mask.unsqueeze(-1)


class PFN(nn.Module):
    """Partition filter network: text encoder + PFN encoder + NER/RE heads.

    Args:
        args: configuration object. The attributes read here are ``dropout``,
            ``embed_mode``, ``data``, ``mode`` and
            ``word_pieces_aggregation``.
        input_size: feature size fed to the PFN encoder.
        ner2idx: entity label mapping, forwarded to ``ner_unit``.
        rel2idx: relation label mapping, forwarded to ``re_unit``.
    """

    def __init__(self, args, input_size, ner2idx, rel2idx):
        super(PFN, self).__init__()
        self.args = args
        self.feature_extractor = encoder(args, input_size)

        self.ner = ner_unit(args, ner2idx)
        self.re = re_unit(args, rel2idx)
        self.dropout = nn.Dropout(args.dropout)

        # Any other embed_mode leaves tokenizer/bert undefined.
        if args.embed_mode == 'albert':
            self.tokenizer = AlbertTokenizer.from_pretrained("albert-xxlarge-v1")
            self.bert = AlbertModel.from_pretrained("albert-xxlarge-v1")
        elif args.embed_mode == 'bert_cased':
            self.tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
            self.bert = AutoModel.from_pretrained("bert-base-cased")
        elif args.embed_mode == 'biobert':
            self.tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
            self.bert = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        elif args.embed_mode == 'canine_c':
            # CANINE classes come from the Environment 1 import at file top.
            self.tokenizer = CanineTokenizer.from_pretrained('google/canine-c')
            self.bert = CanineModel.from_pretrained('google/canine-c')
        elif args.embed_mode == 'canine_s':
            self.tokenizer = CanineTokenizer.from_pretrained('google/canine-s')
            self.bert = CanineModel.from_pretrained('google/canine-s')
        elif args.embed_mode == 'characterBERT':
            # CharacterBERT classes come from the Environment 2 imports at
            # file top; this branch has no tokenizer, only an indexer.
            self.indexer = CharacterIndexer()
            if args.data == 'ADE':
                self.bert = CharacterBertModel.from_pretrained(
                    '../character-bert/pretrained-models/medical_character_bert/').to(device)
            else:
                self.bert = CharacterBertModel.from_pretrained(
                    '../character-bert/pretrained-models/general_character_bert/').to(device)

        # Longest sentence (precalculated):
        #       - ADE: 97 (+2 for the special tokens)
        #       - CONLL04: 118 (+2 for the special tokens)
        # For any other dataset these attributes stay undefined; they are
        # only read by the word-level pooling helpers.
        if args.data == 'ADE':
            self.max_sentence_length = 99
            self.padding_max_length = 170
        elif args.data == 'CONLL04':
            self.max_sentence_length = 120
            self.padding_max_length = 200

    def forward(self, x, mask, attention_masks_charBert, word_to_bep, emb):
        """Score entities and relations for a batch.

        Args:
            x: batch input. For ``mode == 'pretrained_frozen_emb'`` a sequence
                of tensors that is stacked along dimension 0; otherwise it is
                handed to the tokenizer (or to the CharacterBERT indexer).
            mask: mask forwarded unchanged to the NER and RE heads.
            attention_masks_charBert: sequence of tensors stacked into the
                CharacterBERT attention mask; only read in that branch.
            word_to_bep: per-sentence word-to-sub-token alignment; only read
                in the word-level CANINE / word-piece branches.
            emb: unused; kept for interface compatibility.

        Returns:
            ``(ner_score, re_score)`` as produced by the two heads.

        Raises:
            ValueError: if ``args.mode`` is not one of the supported modes
                (previously the method silently returned ``None``).
        """
        mode = self.args.mode
        if mode == 'pretrained_frozen_emb':
            features = torch.stack(x, 0).to(device)
        elif mode == 'e2e_training':
            encoded = self.tokenizer(x, return_tensors="pt",
                                     padding='longest',
                                     is_split_into_words=True).to(device)
            features = self.bert(**encoded)[0]
        elif mode == 'e2e_training_word_level':
            features = self._word_level_features(x, attention_masks_charBert, word_to_bep)
        else:
            raise ValueError('Unsupported args.mode: {!r}'.format(mode))
        return self._decode(features, mask)

    def _decode(self, features, mask):
        """Swap the first two axes, apply dropout, then encode and score."""
        features = features.transpose(0, 1)
        if self.training:
            features = self.dropout(features)

        h_ner, h_re, h_share = self.feature_extractor(features)

        ner_score = self.ner(h_ner, h_share, mask)
        re_score = self.re(h_re, h_share, mask)
        return ner_score, re_score

    def _word_level_features(self, x, attention_masks_charBert, word_to_bep):
        """Return one feature vector per word, indexed (batch, position, hidden)."""
        embed_mode = self.args.embed_mode

        if embed_mode == 'characterBERT':
            # Convert token sequences into character indices.
            batch_ids = self.indexer.as_padded_tensor(x).to(device)
            attention = torch.stack(attention_masks_charBert, 0).to(device)
            return self.bert(batch_ids, attention)[0]

        if embed_mode.split('_')[0] == 'canine':
            encoded = self.tokenizer(x, padding="longest",
                                     return_tensors="pt").to(device)
            hidden = self.bert(**encoded)[0]
            pooled = [self._pool_canine(hidden[i], word_ranges)
                      for i, word_ranges in enumerate(word_to_bep)]
            return torch.stack(pooled, 0)

        encoded = self.tokenizer(x, return_tensors="pt",
                                 padding='max_length',
                                 max_length=self.padding_max_length,
                                 is_split_into_words=True).to(device)
        hidden = self.bert(**encoded)[0]
        pooled = [self._pool_word_pieces(hidden[i], word_spans)
                  for i, word_spans in enumerate(word_to_bep)]
        return torch.stack(pooled, 0)

    def _aggregate(self, pieces):
        """Reduce the sub-token vectors of one word along dimension 0."""
        how = self.args.word_pieces_aggregation
        if how == 'avg':
            return torch.mean(pieces, dim=0)
        if how == 'sum':
            return torch.sum(pieces, dim=0)
        # An unknown setting used to drop every word silently.
        raise ValueError('Unsupported word_pieces_aggregation: {!r}'.format(how))

    def _pool_canine(self, sentence, word_ranges):
        """Pool character vectors into word vectors and zero-pad the result.

        ``word_ranges`` holds, per word, the positions of its characters in
        ``sentence``. The output has ``max_sentence_length`` rows: the vector
        at position 0, one row per word, the vector right after the last
        word, then zeros.
        """
        rows = [sentence[0]]
        rows.extend(self._aggregate(sentence[positions]) for positions in word_ranges)
        # With no words at all, the closing special token directly follows
        # position 0.
        last_position = word_ranges[-1][-1] if len(word_ranges) else 0
        rows.append(sentence[last_position + 1])

        pooled = torch.stack(rows, 0)
        # Pad with zeros matching the encoder output (device, dtype and
        # width) instead of assuming a width of 768.
        padding = sentence.new_zeros(self.max_sentence_length - pooled.size(0),
                                     sentence.size(-1))
        return torch.cat((pooled, padding), dim=0)

    def _pool_word_pieces(self, sentence, word_spans):
        """Pool word-piece vectors into word vectors and keep padding rows.

        ``word_spans`` maps each word to a pair of inclusive word-piece
        indices that do not count the leading special token (hence the +1).
        The output is the vector at position 0, one row per word, the vector
        right after the last word, then padded positions taken from the
        encoder output up to ``max_sentence_length`` rows.
        """
        rows = [sentence[0]]
        # Reset per sentence: with no words the closing special token sits at
        # position 1. Previously `end` was unbound or leaked from the
        # preceding sentence in that case.
        end = 0
        for span in word_spans.values():
            start, end = span[0] + 1, span[1] + 1
            rows.append(self._aggregate(sentence[start:end + 1]))
        rows.append(sentence[end + 1])

        pad_stop = self.max_sentence_length + end + 2 - len(rows)
        pooled = torch.stack(rows, 0)
        return torch.cat((pooled, sentence[end + 2:pad_stop]), dim=0)