From 50620f96456713fb5aa4ad3a61630338e12913fe Mon Sep 17 00:00:00 2001 From: nitin Date: Tue, 12 Jul 2022 10:45:22 +0800 Subject: [PATCH 01/32] mtl init --- hebpipe/lib/allennlp/__init__.py | 0 .../lib/allennlp/conditional_random_field.py | 639 ++++++++++++++++++ hebpipe/lib/allennlp/time_distributed.py | 79 +++ .../lib/multitask_sentsplitter_postagger.py | 213 ++++++ 4 files changed, 931 insertions(+) create mode 100644 hebpipe/lib/allennlp/__init__.py create mode 100644 hebpipe/lib/allennlp/conditional_random_field.py create mode 100644 hebpipe/lib/allennlp/time_distributed.py create mode 100644 hebpipe/lib/multitask_sentsplitter_postagger.py diff --git a/hebpipe/lib/allennlp/__init__.py b/hebpipe/lib/allennlp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hebpipe/lib/allennlp/conditional_random_field.py b/hebpipe/lib/allennlp/conditional_random_field.py new file mode 100644 index 0000000..2982358 --- /dev/null +++ b/hebpipe/lib/allennlp/conditional_random_field.py @@ -0,0 +1,639 @@ +""" +Conditional random field +""" +import logging +import math +import torch + +from typing import List, Tuple, Dict, Union,Optional,Any, TypeVar + +logger = logging.getLogger(__name__) + +T = TypeVar("T") +StateDictType = Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"] + +VITERBI_DECODING = Tuple[List[int], float] # a list of tags, and a viterbi score + + +class ConfigurationError(Exception): + + """ + The exception raised by any AllenNLP object when it's misconfigured + (e.g. missing properties, invalid properties, unknown properties). + """ + + def __reduce__(self) -> Union[str, Tuple[Any, ...]]: + return type(self), (self.message,) + + def __init__(self, message: str): + super().__init__() + self.message = message + + def __str__(self): + return self.message + +def logsumexp(tensor: torch.Tensor, dim: int = -1, keepdim: bool = False) -> torch.Tensor: + """ + A numerically stable computation of logsumexp. This is mathematically equivalent to + `tensor.exp().sum(dim, keep=keepdim).log()`. This function is typically used for summing log + probabilities. + + # Parameters + + tensor : `torch.FloatTensor`, required. + A tensor of arbitrary size. + dim : `int`, optional (default = `-1`) + The dimension of the tensor to apply the logsumexp to. + keepdim: `bool`, optional (default = `False`) + Whether to retain a dimension of size one at the dimension we reduce over. + """ + max_score, _ = tensor.max(dim, keepdim=keepdim) + if keepdim: + stable_vec = tensor - max_score + else: + stable_vec = tensor - max_score.unsqueeze(dim) + return max_score + (stable_vec.exp().sum(dim, keepdim=keepdim)).log() + + +def viterbi_decode( + tag_sequence: torch.Tensor, + transition_matrix: torch.Tensor, + tag_observations: Optional[List[int]] = None, + allowed_start_transitions: torch.Tensor = None, + allowed_end_transitions: torch.Tensor = None, + top_k: int = None, +): + """ + Perform Viterbi decoding in log space over a sequence given a transition matrix + specifying pairwise (transition) potentials between tags and a matrix of shape + (sequence_length, num_tags) specifying unary potentials for possible tags per + timestep. + + # Parameters + + tag_sequence : `torch.Tensor`, required. + A tensor of shape (sequence_length, num_tags) representing scores for + a set of tags over a given sequence. + transition_matrix : `torch.Tensor`, required. + A tensor of shape (num_tags, num_tags) representing the binary potentials + for transitioning between a given pair of tags. 
+ tag_observations : `Optional[List[int]]`, optional, (default = `None`) + A list of length `sequence_length` containing the class ids of observed + elements in the sequence, with unobserved elements being set to -1. Note that + it is possible to provide evidence which results in degenerate labelings if + the sequences of tags you provide as evidence cannot transition between each + other, or those transitions are extremely unlikely. In this situation we log a + warning, but the responsibility for providing self-consistent evidence ultimately + lies with the user. + allowed_start_transitions : `torch.Tensor`, optional, (default = `None`) + An optional tensor of shape (num_tags,) describing which tags the START token + may transition *to*. If provided, additional transition constraints will be used for + determining the start element of the sequence. + allowed_end_transitions : `torch.Tensor`, optional, (default = `None`) + An optional tensor of shape (num_tags,) describing which tags may transition *to* the + end tag. If provided, additional transition constraints will be used for determining + the end element of the sequence. + top_k : `int`, optional, (default = `None`) + Optional integer specifying how many of the top paths to return. For top_k>=1, returns + a tuple of two lists: top_k_paths, top_k_scores, For top_k==None, returns a flattened + tuple with just the top path and its score (not in lists, for backwards compatibility). + + # Returns + + viterbi_path : `List[int]` + The tag indices of the maximum likelihood tag sequence. + viterbi_score : `torch.Tensor` + The score of the viterbi path. + """ + if top_k is None: + top_k = 1 + flatten_output = True + elif top_k >= 1: + flatten_output = False + else: + raise ValueError(f"top_k must be either None or an integer >=1. Instead received {top_k}") + + sequence_length, num_tags = list(tag_sequence.size()) + + has_start_end_restrictions = ( + allowed_end_transitions is not None or allowed_start_transitions is not None + ) + + if has_start_end_restrictions: + + if allowed_end_transitions is None: + allowed_end_transitions = torch.zeros(num_tags) + if allowed_start_transitions is None: + allowed_start_transitions = torch.zeros(num_tags) + + num_tags = num_tags + 2 + new_transition_matrix = torch.zeros(num_tags, num_tags) + new_transition_matrix[:-2, :-2] = transition_matrix + + # Start and end transitions are fully defined, but cannot transition between each other. + + allowed_start_transitions = torch.cat( + [allowed_start_transitions, torch.tensor([-math.inf, -math.inf])] + ) + allowed_end_transitions = torch.cat( + [allowed_end_transitions, torch.tensor([-math.inf, -math.inf])] + ) + + # First define how we may transition FROM the start and end tags. + new_transition_matrix[-2, :] = allowed_start_transitions + # We cannot transition from the end tag to any tag. + new_transition_matrix[-1, :] = -math.inf + + new_transition_matrix[:, -1] = allowed_end_transitions + # We cannot transition to the start tag from any tag. + new_transition_matrix[:, -2] = -math.inf + + transition_matrix = new_transition_matrix + + if tag_observations: + if len(tag_observations) != sequence_length: + raise ConfigurationError( + "Observations were provided, but they were not the same length " + "as the sequence. 
Found sequence of length: {} and evidence: {}".format( + sequence_length, tag_observations + ) + ) + else: + tag_observations = [-1 for _ in range(sequence_length)] + + if has_start_end_restrictions: + tag_observations = [num_tags - 2] + tag_observations + [num_tags - 1] + zero_sentinel = torch.zeros(1, num_tags) + extra_tags_sentinel = torch.ones(sequence_length, 2) * -math.inf + tag_sequence = torch.cat([tag_sequence, extra_tags_sentinel], -1) + tag_sequence = torch.cat([zero_sentinel, tag_sequence, zero_sentinel], 0) + sequence_length = tag_sequence.size(0) + + path_scores = [] + path_indices = [] + + if tag_observations[0] != -1: + one_hot = torch.zeros(num_tags) + one_hot[tag_observations[0]] = 100000.0 + path_scores.append(one_hot.unsqueeze(0)) + else: + path_scores.append(tag_sequence[0, :].unsqueeze(0)) + + # Evaluate the scores for all possible paths. + for timestep in range(1, sequence_length): + # Add pairwise potentials to current scores. + summed_potentials = path_scores[timestep - 1].unsqueeze(2) + transition_matrix + summed_potentials = summed_potentials.view(-1, num_tags) + + # Best pairwise potential path score from the previous timestep. + max_k = min(summed_potentials.size()[0], top_k) + scores, paths = torch.topk(summed_potentials, k=max_k, dim=0) + + # If we have an observation for this timestep, use it + # instead of the distribution over tags. + observation = tag_observations[timestep] + # Warn the user if they have passed + # invalid/extremely unlikely evidence. + if tag_observations[timestep - 1] != -1 and observation != -1: + if transition_matrix[tag_observations[timestep - 1], observation] < -10000: + logger.warning( + "The pairwise potential between tags you have passed as " + "observations is extremely unlikely. Double check your evidence " + "or transition potentials!" + ) + if observation != -1: + one_hot = torch.zeros(num_tags) + one_hot[observation] = 100000.0 + path_scores.append(one_hot.unsqueeze(0)) + else: + path_scores.append(tag_sequence[timestep, :] + scores) + path_indices.append(paths.squeeze()) + + # Construct the most likely sequence backwards. + path_scores_v = path_scores[-1].view(-1) + max_k = min(path_scores_v.size()[0], top_k) + viterbi_scores, best_paths = torch.topk(path_scores_v, k=max_k, dim=0) + viterbi_paths = [] + for i in range(max_k): + viterbi_path = [best_paths[i]] + for backward_timestep in reversed(path_indices): + viterbi_path.append(int(backward_timestep.view(-1)[viterbi_path[-1]])) + # Reverse the backward path. + viterbi_path.reverse() + + if has_start_end_restrictions: + viterbi_path = viterbi_path[1:-1] + + # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo. + viterbi_path = [j % num_tags for j in viterbi_path] + viterbi_paths.append(viterbi_path) + + if flatten_output: + return viterbi_paths[0], viterbi_scores[0] + + return viterbi_paths, viterbi_scores + + +def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tuple[int, int]]: + """ + Given labels and a constraint type, returns the allowed transitions. It will + additionally include transitions for the start and end states, which are used + by the conditional random field. + # Parameters + constraint_type : `str`, required + Indicates which constraint to apply. Current choices are + "BIO", "IOB1", "BIOUL", and "BMES". + labels : `Dict[int, str]`, required + A mapping {label_id -> label}. 
Most commonly this would be the value from + Vocabulary.get_index_to_token_vocabulary() + # Returns + `List[Tuple[int, int]]` + The allowed transitions (from_label_id, to_label_id). + """ + num_labels = len(labels) + start_tag = num_labels + end_tag = num_labels + 1 + labels_with_boundaries = list(labels.items()) + [(start_tag, "START"), (end_tag, "END")] + + allowed = [] + for from_label_index, from_label in labels_with_boundaries: + if from_label in ("START", "END"): + from_tag = from_label + from_entity = "" + else: + from_tag = from_label[0] + from_entity = from_label[1:] + for to_label_index, to_label in labels_with_boundaries: + if to_label in ("START", "END"): + to_tag = to_label + to_entity = "" + else: + to_tag = to_label[0] + to_entity = to_label[1:] + if is_transition_allowed(constraint_type, from_tag, from_entity, to_tag, to_entity): + allowed.append((from_label_index, to_label_index)) + return allowed + + + +def is_transition_allowed( + constraint_type: str, from_tag: str, from_entity: str, to_tag: str, to_entity: str +): + """ + Given a constraint type and strings `from_tag` and `to_tag` that + represent the origin and destination of the transition, return whether + the transition is allowed under the given constraint type. + # Parameters + constraint_type : `str`, required + Indicates which constraint to apply. Current choices are + "BIO", "IOB1", "BIOUL", and "BMES". + from_tag : `str`, required + The tag that the transition originates from. For example, if the + label is `I-PER`, the `from_tag` is `I`. + from_entity : `str`, required + The entity corresponding to the `from_tag`. For example, if the + label is `I-PER`, the `from_entity` is `PER`. + to_tag : `str`, required + The tag that the transition leads to. For example, if the + label is `I-PER`, the `to_tag` is `I`. + to_entity : `str`, required + The entity corresponding to the `to_tag`. For example, if the + label is `I-PER`, the `to_entity` is `PER`. + # Returns + `bool` + Whether the transition is allowed under the given `constraint_type`. + """ + + if to_tag == "START" or from_tag == "END": + # Cannot transition into START or from END + return False + + if constraint_type == "BIOUL": + if from_tag == "START": + return to_tag in ("O", "B", "U") + if to_tag == "END": + return from_tag in ("O", "L", "U") + return any( + [ + # O can transition to O, B-* or U-* + # L-x can transition to O, B-*, or U-* + # U-x can transition to O, B-*, or U-* + from_tag in ("O", "L", "U") and to_tag in ("O", "B", "U"), + # B-x can only transition to I-x or L-x + # I-x can only transition to I-x or L-x + from_tag in ("B", "I") and to_tag in ("I", "L") and from_entity == to_entity, + ] + ) + elif constraint_type == "BIO": + if from_tag == "START": + return to_tag in ("O", "B") + if to_tag == "END": + return from_tag in ("O", "B", "I") + return any( + [ + # Can always transition to O or B-x + to_tag in ("O", "B"), + # Can only transition to I-x from B-x or I-x + to_tag == "I" and from_tag in ("B", "I") and from_entity == to_entity, + ] + ) + elif constraint_type == "IOB1": + if from_tag == "START": + return to_tag in ("O", "I") + if to_tag == "END": + return from_tag in ("O", "B", "I") + return any( + [ + # Can always transition to O or I-x + to_tag in ("O", "I"), + # Can only transition to B-x from B-x or I-x, where + # x is the same tag. 
+ to_tag == "B" and from_tag in ("B", "I") and from_entity == to_entity, + ] + ) + elif constraint_type == "BMES": + if from_tag == "START": + return to_tag in ("B", "S") + if to_tag == "END": + return from_tag in ("E", "S") + return any( + [ + # Can only transition to B or S from E or S. + to_tag in ("B", "S") and from_tag in ("E", "S"), + # Can only transition to M-x from B-x, where + # x is the same tag. + to_tag == "M" and from_tag in ("B", "M") and from_entity == to_entity, + # Can only transition to E-x from B-x or M-x, where + # x is the same tag. + to_tag == "E" and from_tag in ("B", "M") and from_entity == to_entity, + ] + ) + else: + raise ConfigurationError(f"Unknown constraint type: {constraint_type}") + + +class ConditionalRandomField(torch.nn.Module): + """ + This module uses the "forward-backward" algorithm to compute + the log-likelihood of its inputs assuming a conditional random field model. + See, e.g. http://www.cs.columbia.edu/~mcollins/fb.pdf + # Parameters + num_tags : `int`, required + The number of tags. + constraints : `List[Tuple[int, int]]`, optional (default = `None`) + An optional list of allowed transitions (from_tag_id, to_tag_id). + These are applied to `viterbi_tags()` but do not affect `forward()`. + These should be derived from `allowed_transitions` so that the + start and end transitions are handled correctly for your tag type. + include_start_end_transitions : `bool`, optional (default = `True`) + Whether to include the start and end transition parameters. + """ + + def __init__( + self, + num_tags: int, + constraints: List[Tuple[int, int]] = None, + include_start_end_transitions: bool = True, + ) -> None: + super().__init__() + self.num_tags = num_tags + + # transitions[i, j] is the logit for transitioning from state i to state j. + self.transitions = torch.nn.Parameter(torch.empty(num_tags, num_tags)) + + # _constraint_mask indicates valid transitions (based on supplied constraints). + # Include special start of sequence (num_tags + 1) and end of sequence tags (num_tags + 2) + if constraints is None: + # All transitions are valid. + constraint_mask = torch.full((num_tags + 2, num_tags + 2), 1.0) + else: + constraint_mask = torch.full((num_tags + 2, num_tags + 2), 0.0) + for i, j in constraints: + constraint_mask[i, j] = 1.0 + + self._constraint_mask = torch.nn.Parameter(constraint_mask, requires_grad=False) + + # Also need logits for transitioning from "start" state and to "end" state. + self.include_start_end_transitions = include_start_end_transitions + if include_start_end_transitions: + self.start_transitions = torch.nn.Parameter(torch.Tensor(num_tags)) + self.end_transitions = torch.nn.Parameter(torch.Tensor(num_tags)) + + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.xavier_normal_(self.transitions) + if self.include_start_end_transitions: + torch.nn.init.normal_(self.start_transitions) + torch.nn.init.normal_(self.end_transitions) + + def _input_likelihood(self, logits: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: + """ + Computes the (batch_size,) denominator term for the log-likelihood, which is the + sum of the likelihoods across all possible state sequences. 
+ """ + batch_size, sequence_length, num_tags = logits.size() + + # Transpose batch size and sequence dimensions + mask = mask.transpose(0, 1).contiguous() + logits = logits.transpose(0, 1).contiguous() + + # Initial alpha is the (batch_size, num_tags) tensor of likelihoods combining the + # transitions to the initial states and the logits for the first timestep. + if self.include_start_end_transitions: + alpha = self.start_transitions.view(1, num_tags) + logits[0] + else: + alpha = logits[0] + + # For each i we compute logits for the transitions from timestep i-1 to timestep i. + # We do so in a (batch_size, num_tags, num_tags) tensor where the axes are + # (instance, current_tag, next_tag) + for i in range(1, sequence_length): + # The emit scores are for time i ("next_tag") so we broadcast along the current_tag axis. + emit_scores = logits[i].view(batch_size, 1, num_tags) + # Transition scores are (current_tag, next_tag) so we broadcast along the instance axis. + transition_scores = self.transitions.view(1, num_tags, num_tags) + # Alpha is for the current_tag, so we broadcast along the next_tag axis. + broadcast_alpha = alpha.view(batch_size, num_tags, 1) + + # Add all the scores together and logexp over the current_tag axis. + inner = broadcast_alpha + emit_scores + transition_scores + + # In valid positions (mask == True) we want to take the logsumexp over the current_tag dimension + # of `inner`. Otherwise (mask == False) we want to retain the previous alpha. + alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * ( + ~mask[i] + ).view(batch_size, 1) + + # Every sequence needs to end with a transition to the stop_tag. + if self.include_start_end_transitions: + stops = alpha + self.end_transitions.view(1, num_tags) + else: + stops = alpha + + # Finally we log_sum_exp along the num_tags dim, result is (batch_size,) + return logsumexp(stops) + + def _joint_likelihood( + self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor + ) -> torch.Tensor: + """ + Computes the numerator term for the log-likelihood, which is just score(inputs, tags) + """ + batch_size, sequence_length, _ = logits.data.shape + + # Transpose batch size and sequence dimensions: + logits = logits.transpose(0, 1).contiguous() + mask = mask.transpose(0, 1).contiguous() + tags = tags.transpose(0, 1).contiguous() + + # Start with the transition scores from start_tag to the first tag in each input + if self.include_start_end_transitions: + score = self.start_transitions.index_select(0, tags[0]) + else: + score = 0.0 + + # Add up the scores for the observed transitions and all the inputs but the last + for i in range(sequence_length - 1): + # Each is shape (batch_size,) + current_tag, next_tag = tags[i], tags[i + 1] + + # The scores for transitioning from current_tag to next_tag + transition_score = self.transitions[current_tag.view(-1), next_tag.view(-1)] + + # The score for using current_tag + emit_score = logits[i].gather(1, current_tag.view(batch_size, 1)).squeeze(1) + + # Include transition score if next element is unmasked, + # input_score if this element is unmasked. + score = score + transition_score * mask[i + 1] + emit_score * mask[i] + + # Transition from last state to "stop" state. To start with, we need to find the last tag + # for each instance. + last_tag_index = mask.sum(0).long() - 1 + last_tags = tags.gather(0, last_tag_index.view(1, batch_size)).squeeze(0) + + # Compute score of transitioning to `stop_tag` from each "last tag". 
+ if self.include_start_end_transitions: + last_transition_score = self.end_transitions.index_select(0, last_tags) + else: + last_transition_score = 0.0 + + # Add the last input if it's not masked. + last_inputs = logits[-1] # (batch_size, num_tags) + last_input_score = last_inputs.gather(1, last_tags.view(-1, 1)) # (batch_size, 1) + last_input_score = last_input_score.squeeze() # (batch_size,) + + score = score + last_transition_score + last_input_score * mask[-1] + + return score + + def forward( + self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None + ) -> torch.Tensor: + """ + Computes the log likelihood. + """ + + if mask is None: + mask = torch.ones(*tags.size(), dtype=torch.bool, device=inputs.device) + else: + # The code below fails in weird ways if this isn't a bool tensor, so we make sure. + mask = mask.to(torch.bool) + + log_denominator = self._input_likelihood(inputs, mask) + log_numerator = self._joint_likelihood(inputs, tags, mask) + + return torch.sum(log_numerator - log_denominator) + + def viterbi_tags( + self, logits: torch.Tensor, mask: torch.BoolTensor = None, top_k: int = None + ) -> Union[List[VITERBI_DECODING], List[List[VITERBI_DECODING]]]: + """ + Uses viterbi algorithm to find most likely tags for the given inputs. + If constraints are applied, disallows all other transitions. + Returns a list of results, of the same size as the batch (one result per batch member) + Each result is a List of length top_k, containing the top K viterbi decodings + Each decoding is a tuple (tag_sequence, viterbi_score) + For backwards compatibility, if top_k is None, then instead returns a flat list of + tag sequences (the top tag sequence for each batch item). + """ + if mask is None: + mask = torch.ones(*logits.shape[:2], dtype=torch.bool, device=logits.device) + + if top_k is None: + top_k = 1 + flatten_output = True + else: + flatten_output = False + + _, max_seq_length, num_tags = logits.size() + + # Get the tensors out of the variables + logits, mask = logits.data, mask.data + + # Augment transitions matrix with start and end transitions + start_tag = num_tags + end_tag = num_tags + 1 + transitions = torch.full((num_tags + 2, num_tags + 2), -10000.0, device=logits.device) + + # Apply transition constraints + constrained_transitions = self.transitions * self._constraint_mask[ + :num_tags, :num_tags + ] + -10000.0 * (1 - self._constraint_mask[:num_tags, :num_tags]) + transitions[:num_tags, :num_tags] = constrained_transitions.data + + if self.include_start_end_transitions: + transitions[ + start_tag, :num_tags + ] = self.start_transitions.detach() * self._constraint_mask[ + start_tag, :num_tags + ].data + -10000.0 * ( + 1 - self._constraint_mask[start_tag, :num_tags].detach() + ) + transitions[:num_tags, end_tag] = self.end_transitions.detach() * self._constraint_mask[ + :num_tags, end_tag + ].data + -10000.0 * (1 - self._constraint_mask[:num_tags, end_tag].detach()) + else: + transitions[start_tag, :num_tags] = -10000.0 * ( + 1 - self._constraint_mask[start_tag, :num_tags].detach() + ) + transitions[:num_tags, end_tag] = -10000.0 * ( + 1 - self._constraint_mask[:num_tags, end_tag].detach() + ) + + best_paths = [] + # Pad the max sequence length by 2 to account for start_tag + end_tag. 
+ tag_sequence = torch.empty(max_seq_length + 2, num_tags + 2, device=logits.device) + + for prediction, prediction_mask in zip(logits, mask): + mask_indices = prediction_mask.nonzero(as_tuple=False).squeeze() + masked_prediction = torch.index_select(prediction, 0, mask_indices) + sequence_length = masked_prediction.shape[0] + + # Start with everything totally unlikely + tag_sequence.fill_(-10000.0) + # At timestep 0 we must have the START_TAG + tag_sequence[0, start_tag] = 0.0 + # At steps 1, ..., sequence_length we just use the incoming prediction + tag_sequence[1 : (sequence_length + 1), :num_tags] = masked_prediction + # And at the last timestep we must have the END_TAG + tag_sequence[sequence_length + 1, end_tag] = 0.0 + + # We pass the tags and the transitions to `viterbi_decode`. + viterbi_paths, viterbi_scores = viterbi_decode( + tag_sequence=tag_sequence[: (sequence_length + 2)], + transition_matrix=transitions, + top_k=top_k, + ) + top_k_paths = [] + for viterbi_path, viterbi_score in zip(viterbi_paths, viterbi_scores): + # Get rid of START and END sentinels and append. + viterbi_path = viterbi_path[1:-1] + top_k_paths.append((viterbi_path, viterbi_score.item())) + best_paths.append(top_k_paths) + + if flatten_output: + return [top_k_paths[0] for top_k_paths in best_paths] + + return best_paths \ No newline at end of file diff --git a/hebpipe/lib/allennlp/time_distributed.py b/hebpipe/lib/allennlp/time_distributed.py new file mode 100644 index 0000000..7a4d86a --- /dev/null +++ b/hebpipe/lib/allennlp/time_distributed.py @@ -0,0 +1,79 @@ +""" +A wrapper that unrolls the second (time) dimension of a tensor +into the first (batch) dimension, applies some other `Module`, +and then rolls the time dimension back up. +""" + +from typing import List + + +import torch + + +class TimeDistributed(torch.nn.Module): + """ + Given an input shaped like `(batch_size, time_steps, [rest])` and a `Module` that takes + inputs like `(batch_size, [rest])`, `TimeDistributed` reshapes the input to be + `(batch_size * time_steps, [rest])`, applies the contained `Module`, then reshapes it back. + Note that while the above gives shapes with `batch_size` first, this `Module` also works if + `batch_size` is second - we always just combine the first two dimensions, then split them. + It also reshapes keyword arguments unless they are not tensors or their name is specified in + the optional `pass_through` iterable. + """ + + def __init__(self, module): + super().__init__() + self._module = module + + def forward(self, *inputs, pass_through: List[str] = None, **kwargs): + + pass_through = pass_through or [] + + reshaped_inputs = [self._reshape_tensor(input_tensor) for input_tensor in inputs] + + # Need some input to then get the batch_size and time_steps. + some_input = None + if inputs: + some_input = inputs[-1] + + reshaped_kwargs = {} + for key, value in kwargs.items(): + if isinstance(value, torch.Tensor) and key not in pass_through: + if some_input is None: + some_input = value + + value = self._reshape_tensor(value) + + reshaped_kwargs[key] = value + + reshaped_outputs = self._module(*reshaped_inputs, **reshaped_kwargs) + + if some_input is None: + raise RuntimeError("No input tensor to time-distribute") + + # Now get the output back into the right shape. 
+ # (batch_size, time_steps, **output_size) + tuple_output = True + if not isinstance(reshaped_outputs, tuple): + tuple_output = False + reshaped_outputs = (reshaped_outputs,) + + outputs = [] + for reshaped_output in reshaped_outputs: + new_size = some_input.size()[:2] + reshaped_output.size()[1:] + outputs.append(reshaped_output.contiguous().view(new_size)) + + if not tuple_output: + outputs = outputs[0] + + return outputs + + @staticmethod + def _reshape_tensor(input_tensor): + input_size = input_tensor.size() + if len(input_size) <= 2: + raise RuntimeError(f"No dimension to distribute: {input_size}") + # Squash batch_size and time_steps into a single axis; result has shape + # (batch_size * time_steps, **input_size). + squashed_shape = [-1] + list(input_size[2:]) + return input_tensor.contiguous().view(*squashed_shape) diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py new file mode 100644 index 0000000..7f0ae99 --- /dev/null +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -0,0 +1,213 @@ +import conllu +import torch +import torch.nn as nn + +from flair.embeddings import TransformerWordEmbeddings +from flair.data import Sentence +from lib.allennlp.conditional_random_field import ConditionalRandomField +from lib.allennlp.time_distributed import TimeDistributed + + +class MTLModel(nn.Module): + def __init__(self,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=128): + super(MTLModel,self).__init__() + + self.sbdtagset = ['B-SENT', 'O'] + self.postagset = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'] # derived from HTB and IAHLTWiki trainsets #TODO: add other UD tags? + + self.sequence_length = 256 + self.batch_size = 5 + + # for CRF + self.START_TAG = "" + self.STOP_TAG = "" + + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + # Flair embeddings do subword pooling! + self.transformerembeddings = TransformerWordEmbeddings(model='onlplab/alephbert-base',batch_size=self.batch_size,pooling_operation='mean').to(self.device) + + # Bi-LSTM Encoder + self.embeddingdim = 768 # based on BERT model + self.rnndim = rnndim + self.rnnnumlayers = rnnnumlayers + self.rnnbidirectional = rnnbidirectional + self.rnndropout = rnndropout + + + if encodertype == 'lstm': + self.encoder = nn.LSTM(input_size=self.embeddingdim, hidden_size=self.rnndim // 2, + num_layers=self.rnnnumlayers, bidirectional=self.rnnbidirectional, + dropout=self.rnndropout).to(self.device) + elif encodertype == 'gru': + self.encoder = nn.GRU(input_size=self.embeddingdim, hidden_size=self.rnndim // 2, + num_layers=self.rnnnumlayers, bidirectional=self.rnnbidirectional, + dropout=self.rnndropout).to(self.device) + + # Intermediate feedforward layer + self.ffdim = ffdim + self.fflayer = TimeDistributed(nn.Linear(in_features=self.rnndim,out_features=self.ffdim)).to(self.device) + + # Label space for the pos tagger + # TODO: CRF? 
+ self.hidden2postag = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.postagset))).to(self.device) + + # Label space for sent splitter + self.hidden2sbd = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.sbdtagset))).to(self.device) + + # Linear CRF for sent splitter + self.sbd_tag2idx = {'B-SENT':0,'O':1,self.START_TAG:2,self.STOP_TAG:3} # AllenNLP CRF expects start and stop tags to be appended at the end, in that order + self.sbddtransitions = [(0,1),(1,0),(2,0),(2,1),(0,3),(1,3)] + self.sbdcrf = ConditionalRandomField(len(self.sbd_tag2idx),self.sbddtransitions).to(self.device) + + def init_hidden(self): + """ + Used by RNN-type encoders + """ + if self.rnnbidirectional == True: + numdir = 2 + else: + numdir = 1 + + return (torch.randn(self.rnnnumlayers * numdir, 1, self.rnndim // 2, device=self.device), + torch.randn(self.rnnnumlayers * numdir, 1, self.rnndim // 2, device=self.device)) + + def forward(self,slice): + + """ + slice is a list of tuples of length = seq_len. Each tuple is (token, pos tag, sentence boundary label) + """ + + sents = [' '.join([s.split('\t')[0] for s in sls]) for sls in slice] + + sentences = [] + for sent in sents: + sentences.append(Sentence(sent,use_tokenizer=False)) + + sentences = self.transformerembeddings.embed(sentences) + + embeddings = [] + for sent in sentences: + embedding = [] + for tok in sent: + embedding.append(tok.embedding) + for _ in range(len(sent),self.sequence_length): + embedding.append(torch.zeros(768 * 4)) # for padding + embeddings.append(torch.stack(embedding)) + + embeddings = torch.stack(embeddings) + print ('here') + + +class Tagger(): + def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=128): + + self.mtlmodel = MTLModel(rnndim,rnnnumlayers,rnnbidirectional,rnndropout,encodertype,ffdim) + + if trainflag == True: + import tensorboard + self.trainingdatafile = '../data/sentsplit_postag_train_gold.tab' + self.devdatafile = '../data/sentsplit_postag_dev_gold.tab' + else: + self.testdatafile = '../data/sentsplit_postag_test_gold.tab' + + + self.trainflag = trainflag + self.trainfile = trainfile + self.devfile = devfile + self.testfile = testfile + + + def train(self): + + def read_file(mode='train'): + + if mode == 'train': + file = self.trainingdatafile + else: + file = self.devdatafile + + dataset = [] + with open(file,'r') as fi: + lines = fi.readlines() + # split into contiguous sequence of seq_len length + for idx in range(0,len(lines),self.mtlmodel.sequence_length): + if idx + self.mtlmodel.sequence_length >= len(lines): + slice = lines[idx:len(lines)] + else: + slice = lines[idx:idx + self.mtlmodel.sequence_length] + + dataset.append(slice) + + return dataset + + trainingdata = read_file() + devdata = read_file(mode='dev') + + + self.mtlmodel(trainingdata[0:5]) + + + + def predict(self): + pass + + def prepare_data_files(self): + """ + Prepares the train and dev data files for training + """ + def write_file(filename,mode='train'): + + if mode == 'dev': + data = devdata + else: + data = traindata + + with open(filename,'w') as tr: + for sent in data: + for i in range(0,len(sent)): # This will disregard the final punct in each sentence. + + if isinstance(sent[i]['id'],tuple): continue + + if i == len(sent) - 2 and (sent[i + 1]['form'] == '.' 
and sent[i + 1]['upos'] == 'PUNCT'): + tr.write(sent[i]['form'] + '\t' + sent[i]['upos'] + '\t' + 'B-SENT' + '\n') + elif i == len(sent) - 1 and (sent[i]['form'] != '.' and sent[i]['upos'] != 'PUNCT'): + tr.write(sent[i]['form'] + '\t' + sent[i]['upos'] + '\t' + 'B-SENT' + '\n') + elif i != len(sent) - 1: + tr.write(sent[i]['form'] + '\t' + sent[i]['upos'] + '\t' + 'O' + '\n') + + traindata = self.read_conllu() + devdata = self.read_conllu(mode='dev') + + write_file(self.trainingdatafile,mode='train') + write_file(self.devdatafile,mode='dev') + + + def read_conllu(self,mode='train'): + + fields = tuple( + list(conllu.parser.DEFAULT_FIELDS) + ) + + if mode == 'dev': + file = self.devfile + else: + file = self.trainfile + + with open(file, "r", encoding="utf-8") as f: + return conllu.parse(f.read(), fields=fields) + + +def main(): # testing only + + iahltwikitrain = '/home/nitin/Desktop/IAHLT/UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-train.conllu' + iahltwikidev = '/home/nitin/Desktop/IAHLT/UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-dev.conllu' + tagger = Tagger(trainflag=True,trainfile=iahltwikitrain,devfile=iahltwikidev) + tagger.prepare_data_files() + tagger.train() + + print ('here') + + +if __name__ == "__main__": + main() \ No newline at end of file From 44c50065d5724bdeb565f22811e3eb4549e9b38a Mon Sep 17 00:00:00 2001 From: nitin Date: Wed, 13 Jul 2022 02:35:03 +0800 Subject: [PATCH 02/32] train pipeline running --- .../lib/multitask_sentsplitter_postagger.py | 129 ++++++++++++++---- 1 file changed, 100 insertions(+), 29 deletions(-) diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py index 7f0ae99..d744f0c 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -1,6 +1,9 @@ import conllu import torch import torch.nn as nn +import os +import shutil + from flair.embeddings import TransformerWordEmbeddings from flair.data import Sentence @@ -8,19 +11,20 @@ from lib.allennlp.time_distributed import TimeDistributed + +from time import time + class MTLModel(nn.Module): - def __init__(self,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=128): + def __init__(self,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=128,batchsize=32): super(MTLModel,self).__init__() - self.sbdtagset = ['B-SENT', 'O'] - self.postagset = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'] # derived from HTB and IAHLTWiki trainsets #TODO: add other UD tags? + self.sbdtagset = {'B-SENT':0, 'O':1} + self.postagset = {'ADJ':0, 'ADP':1, 'ADV':2, 'AUX':3, 'CCONJ':4, 'DET':5, 'INTJ':6, 'NOUN':7, 'NUM':8, 'PRON':9, 'PROPN':10, 'PUNCT':11, 'SCONJ':12, 'SYM':13, 'VERB':14, 'X':15} # derived from HTB and IAHLTWiki trainsets #TODO: add other UD tags? 
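As an aside on the B-SENT/O label space defined just above: the AllenNLP-style CRF module added earlier in this patch series ships an allowed_transitions() helper that can derive the constraint pairs from a label dictionary rather than hand-writing them. A minimal sketch, treating B-SENT/O as a BIO-style scheme (illustrative only, not part of this patch):

from lib.allennlp.conditional_random_field import allowed_transitions, ConditionalRandomField

# {label_id -> label} for the sentence-boundary task
sbd_labels = {0: 'B-SENT', 1: 'O'}

# Yields (from_id, to_id) pairs with START (=2) and END (=3) handled automatically;
# unlike the hand-written sbddtransitions list used elsewhere in this series, it also
# allows the self-transitions (0, 0) and (1, 1).
constraints = allowed_transitions('BIO', sbd_labels)
crf = ConditionalRandomField(num_tags=len(sbd_labels), constraints=constraints)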
- self.sequence_length = 256 - self.batch_size = 5 + self.sequence_length = 64 + self.batch_size = batchsize + self.encodertype = encodertype - # for CRF - self.START_TAG = "" - self.STOP_TAG = "" self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -28,7 +32,7 @@ def __init__(self,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3 self.transformerembeddings = TransformerWordEmbeddings(model='onlplab/alephbert-base',batch_size=self.batch_size,pooling_operation='mean').to(self.device) # Bi-LSTM Encoder - self.embeddingdim = 768 # based on BERT model + self.embeddingdim = 768 * 4 # based on BERT model with Flair layers self.rnndim = rnndim self.rnnnumlayers = rnnnumlayers self.rnnbidirectional = rnnbidirectional @@ -38,11 +42,11 @@ def __init__(self,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3 if encodertype == 'lstm': self.encoder = nn.LSTM(input_size=self.embeddingdim, hidden_size=self.rnndim // 2, num_layers=self.rnnnumlayers, bidirectional=self.rnnbidirectional, - dropout=self.rnndropout).to(self.device) + dropout=self.rnndropout,batch_first=True).to(self.device) elif encodertype == 'gru': self.encoder = nn.GRU(input_size=self.embeddingdim, hidden_size=self.rnndim // 2, num_layers=self.rnnnumlayers, bidirectional=self.rnnbidirectional, - dropout=self.rnndropout).to(self.device) + dropout=self.rnndropout,batch_first=True).to(self.device) # Intermediate feedforward layer self.ffdim = ffdim @@ -50,15 +54,12 @@ def __init__(self,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3 # Label space for the pos tagger # TODO: CRF? - self.hidden2postag = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.postagset))).to(self.device) + self.hidden2postag = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.postagset.keys()))).to(self.device) # Label space for sent splitter - self.hidden2sbd = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.sbdtagset))).to(self.device) + self.hidden2sbd = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.sbdtagset.keys()))).to(self.device) + - # Linear CRF for sent splitter - self.sbd_tag2idx = {'B-SENT':0,'O':1,self.START_TAG:2,self.STOP_TAG:3} # AllenNLP CRF expects start and stop tags to be appended at the end, in that order - self.sbddtransitions = [(0,1),(1,0),(2,0),(2,1),(0,3),(1,3)] - self.sbdcrf = ConditionalRandomField(len(self.sbd_tag2idx),self.sbddtransitions).to(self.device) def init_hidden(self): """ @@ -69,54 +70,100 @@ def init_hidden(self): else: numdir = 1 - return (torch.randn(self.rnnnumlayers * numdir, 1, self.rnndim // 2, device=self.device), - torch.randn(self.rnnnumlayers * numdir, 1, self.rnndim // 2, device=self.device)) + return (torch.randn(self.rnnnumlayers * numdir, self.batch_size, self.rnndim // 2, device=self.device), + torch.randn(self.rnnnumlayers * numdir, self.batch_size, self.rnndim // 2, device=self.device)) - def forward(self,slice): + def forward(self,data): """ slice is a list of tuples of length = seq_len. 
Each tuple is (token, pos tag, sentence boundary label) """ - sents = [' '.join([s.split('\t')[0] for s in sls]) for sls in slice] + start = time() + sents = [' '.join([s.split('\t')[0] for s in sls]) for sls in data] sentences = [] for sent in sents: sentences.append(Sentence(sent,use_tokenizer=False)) - sentences = self.transformerembeddings.embed(sentences) + sentences = self.transformerembeddings.embed(sentences) # subwords are averaged here + # Needs to extract the embeddings from Token objects, and add padding. embeddings = [] for sent in sentences: embedding = [] for tok in sent: embedding.append(tok.embedding) for _ in range(len(sent),self.sequence_length): - embedding.append(torch.zeros(768 * 4)) # for padding + embedding.append(torch.zeros(768 * 4)) # for padding. 4 because of the Flair TransFormerWordEmbeddings param embeddings.append(torch.stack(embedding)) - embeddings = torch.stack(embeddings) - print ('here') + embeddings = torch.stack(embeddings) # final embeddings in a tensor + print ('here timing') + print (time() - start) + + #if self.encodertype in ('lstm','gru'): + hidden = self.init_hidden() + feats, hidden = self.encoder(embeddings,hidden) + # Intermediate Feedforward layer + feats = self.fflayer(feats) + + # logits for pos + poslogits = self.hidden2postag(feats) + poslogits = poslogits.permute(0,2,1) + + # logits for sbd + sbdlogits = self.hidden2sbd(feats) + + return poslogits,sbdlogits class Tagger(): - def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=128): + def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=128,learningrate = 0.01): self.mtlmodel = MTLModel(rnndim,rnnnumlayers,rnnbidirectional,rnndropout,encodertype,ffdim) if trainflag == True: - import tensorboard + + from torch.utils.tensorboard import SummaryWriter + if os.path.isdir('../data/tensorboarddir/'): + shutil.rmtree('../data/tensorboarddir/') + os.mkdir('../data/tensorboarddir/') + + if not os.path.isdir('../data/checkpoint/'): + os.mkdir('../data/checkpoint/') + + self.writer = SummaryWriter('data/tensorboarddir/') + self.trainingdatafile = '../data/sentsplit_postag_train_gold.tab' self.devdatafile = '../data/sentsplit_postag_dev_gold.tab' else: self.testdatafile = '../data/sentsplit_postag_test_gold.tab' + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") self.trainflag = trainflag self.trainfile = trainfile self.devfile = devfile self.testfile = testfile + self.learningrate = learningrate + self.optimizer = torch.optim.AdamW(self.mtlmodel.parameters(), lr=learningrate) + + # Loss for pos tagging + self.postagloss = nn.CrossEntropyLoss() + self.postagloss.to(self.device) + + # Linear CRF Loss for sent splitter + # for CRF + self.START_TAG = "" + self.STOP_TAG = "" + self.sbd_tag2idx = {'B-SENT': 0, 'O': 1, self.START_TAG: 2, + self.STOP_TAG: 3} # AllenNLP CRF expects start and stop tags to be appended at the end, in that order + self.sbddtransitions = [(0, 1), (1, 0), (2, 0), (2, 1), (0, 3), (1, 3)] + self.sbdcrf = ConditionalRandomField(len(self.sbd_tag2idx) - 2, self.sbddtransitions).to(self.device) + + def train(self): @@ -141,12 +188,36 @@ def read_file(mode='train'): return dataset + epochs = 1000 + trainingdata = read_file() devdata = read_file(mode='dev') + for epoch in range(1,epochs): + + data = trainingdata[0:self.mtlmodel.batch_size] + 
poslogits, sbdlogits = self.mtlmodel(data) + + sbdtags = [[s.split('\t')[2].strip() for s in sls] for sls in data] + sbdtags = [[self.sbd_tag2idx[t] for t in tag] for tag in sbdtags] + sbdtags = torch.LongTensor(sbdtags).to(self.device) + + postags = [[s.split('\t')[1].strip() for s in sls] for sls in data] + postags = [[self.mtlmodel.postagset[t] for t in tag] for tag in postags] + postags = torch.LongTensor(postags).to(self.device) + + posloss = self.postagloss(poslogits,postags) + sbdloss = self.sbdcrf(sbdlogits,sbdtags) * -1 + + mtlloss = posloss + sbdloss # uniform weighting. # TODO: learnable weights? - self.mtlmodel(trainingdata[0:5]) + self.optimizer.zero_grad() + mtlloss.backward() + self.optimizer.step() + self.writer.add_scalar('train_pos_loss', posloss.item(), epoch) + self.writer.add_scalar('train_sbd_loss', sbdloss.item(), epoch) + self.writer.add_scalar('train_joint_loss', mtlloss.item(), epoch) def predict(self): @@ -203,7 +274,7 @@ def main(): # testing only iahltwikitrain = '/home/nitin/Desktop/IAHLT/UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-train.conllu' iahltwikidev = '/home/nitin/Desktop/IAHLT/UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-dev.conllu' tagger = Tagger(trainflag=True,trainfile=iahltwikitrain,devfile=iahltwikidev) - tagger.prepare_data_files() + #tagger.prepare_data_files() tagger.train() print ('here') From 46b128992b685970312d21276c2dcf3ff4f04de3 Mon Sep 17 00:00:00 2001 From: nitin Date: Thu, 14 Jul 2022 03:59:12 +0800 Subject: [PATCH 03/32] pipeline with dev --- .../lib/multitask_sentsplitter_postagger.py | 236 ++++++++++++++---- 1 file changed, 188 insertions(+), 48 deletions(-) diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py index d744f0c..1d29657 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -3,14 +3,15 @@ import torch.nn as nn import os import shutil - +import flair from flair.embeddings import TransformerWordEmbeddings from flair.data import Sentence from lib.allennlp.conditional_random_field import ConditionalRandomField from lib.allennlp.time_distributed import TimeDistributed - - +from random import sample +from collections import defaultdict +from sklearn.metrics import f1_score, precision_score,recall_score from time import time @@ -21,7 +22,7 @@ def __init__(self,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3 self.sbdtagset = {'B-SENT':0, 'O':1} self.postagset = {'ADJ':0, 'ADP':1, 'ADV':2, 'AUX':3, 'CCONJ':4, 'DET':5, 'INTJ':6, 'NOUN':7, 'NUM':8, 'PRON':9, 'PROPN':10, 'PUNCT':11, 'SCONJ':12, 'SYM':13, 'VERB':14, 'X':15} # derived from HTB and IAHLTWiki trainsets #TODO: add other UD tags? - self.sequence_length = 64 + self.sequence_length = 32 self.batch_size = batchsize self.encodertype = encodertype @@ -29,7 +30,7 @@ def __init__(self,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3 self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Flair embeddings do subword pooling! 
- self.transformerembeddings = TransformerWordEmbeddings(model='onlplab/alephbert-base',batch_size=self.batch_size,pooling_operation='mean').to(self.device) + self.transformerembeddings = TransformerWordEmbeddings(model='onlplab/alephbert-base',batch_size=self.batch_size,pooling_operation='mean',fine_tune=True).to(self.device) # Bi-LSTM Encoder self.embeddingdim = 768 * 4 # based on BERT model with Flair layers @@ -54,7 +55,7 @@ def __init__(self,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3 # Label space for the pos tagger # TODO: CRF? - self.hidden2postag = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.postagset.keys()))).to(self.device) + #self.hidden2postag = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.postagset.keys()))).to(self.device) # Label space for sent splitter self.hidden2sbd = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.sbdtagset.keys()))).to(self.device) @@ -79,28 +80,30 @@ def forward(self,data): slice is a list of tuples of length = seq_len. Each tuple is (token, pos tag, sentence boundary label) """ - start = time() - sents = [' '.join([s.split('\t')[0] for s in sls]) for sls in data] - + #start = time() sentences = [] - for sent in sents: + for sent in data: sentences.append(Sentence(sent,use_tokenizer=False)) + #startlookup = time() sentences = self.transformerembeddings.embed(sentences) # subwords are averaged here + #print ('embedding lookup timing') + #print (time() - startlookup) - # Needs to extract the embeddings from Token objects, and add padding. + # need to extract the embeddings from Token objects, and add padding. embeddings = [] for sent in sentences: embedding = [] for tok in sent: embedding.append(tok.embedding) for _ in range(len(sent),self.sequence_length): - embedding.append(torch.zeros(768 * 4)) # for padding. 4 because of the Flair TransFormerWordEmbeddings param - embeddings.append(torch.stack(embedding)) + embedding.append(torch.zeros(768 * 4,device=self.device)) # for padding. 
4 because of the Flair TransFormerWordEmbeddings param + embeddings.append(torch.stack(embedding).to(self.device)) - embeddings = torch.stack(embeddings) # final embeddings in a tensor - print ('here timing') - print (time() - start) + + embeddings = torch.stack(embeddings).to(self.device) # final embeddings in a tensor + #print ('embedding timing') + #print (time() - start) #if self.encodertype in ('lstm','gru'): hidden = self.init_hidden() @@ -110,13 +113,13 @@ def forward(self,data): feats = self.fflayer(feats) # logits for pos - poslogits = self.hidden2postag(feats) - poslogits = poslogits.permute(0,2,1) + #poslogits = self.hidden2postag(feats) + #poslogits = poslogits.permute(0,2,1) # logits for sbd sbdlogits = self.hidden2sbd(feats) - return poslogits,sbdlogits + return None,sbdlogits class Tagger(): def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=128,learningrate = 0.01): @@ -133,7 +136,7 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnnd if not os.path.isdir('../data/checkpoint/'): os.mkdir('../data/checkpoint/') - self.writer = SummaryWriter('data/tensorboarddir/') + self.writer = SummaryWriter('../data/tensorboarddir/') self.trainingdatafile = '../data/sentsplit_postag_train_gold.tab' self.devdatafile = '../data/sentsplit_postag_dev_gold.tab' @@ -155,36 +158,120 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnnd self.postagloss.to(self.device) # Linear CRF Loss for sent splitter - # for CRF self.START_TAG = "" self.STOP_TAG = "" self.sbd_tag2idx = {'B-SENT': 0, 'O': 1, self.START_TAG: 2, self.STOP_TAG: 3} # AllenNLP CRF expects start and stop tags to be appended at the end, in that order self.sbddtransitions = [(0, 1), (1, 0), (2, 0), (2, 1), (0, 3), (1, 3)] - self.sbdcrf = ConditionalRandomField(len(self.sbd_tag2idx) - 2, self.sbddtransitions).to(self.device) + self.sbdcrf = ConditionalRandomField(len(self.sbd_tag2idx) - 2, self.sbddtransitions).to(self.device) # dont include the START and STOP tags in the label count + self.evalstep = 1 + self.stride_size = 10 - def train(self): - def read_file(mode='train'): + def shingle_predict(self,toks,labels=None,type='sbd'): + + """ + Shingles data, then predicts the tag. Applies to dev and test sets only + pass labels if they exist e.g for dev / test Otherwise it's inference on new data. 
+ pass type for the type of label, sbd or pos + """ - if mode == 'train': - file = self.trainingdatafile + spans = [] + if labels: + labelspans = [] + final_mapping = {} + # Hack tokens up into overlapping shingles + wraparound = toks[-self.stride_size:] + toks + toks[: self.mtlmodel.sequence_length] + if labels: + labelwraparound = labels[-self.stride_size:] + labels + labels[: self.mtlmodel.sequence_length] + idx = 0 + mapping = defaultdict(set) + snum = 0 + while idx < len(toks): + if idx + self.mtlmodel.sequence_length < len(wraparound): + span = wraparound[idx: idx + self.mtlmodel.sequence_length] + if labels: + labelspan = labelwraparound[idx: idx + self.mtlmodel.sequence_length] else: - file = self.devdatafile + span = wraparound[idx:] + if labels: + labelspan = labelwraparound[idx:] + sent = " ".join(span) + spans.append(sent) + if labels: + if type == 'sbd': + label = [self.mtlmodel.sbdtagset[l.strip()] for l in labelspan] + labelspans.append(label) + + for i in range(idx - self.stride_size, idx + self.mtlmodel.sequence_length - self.stride_size): + # start, end, snum + if i >= 0 and i < len(toks): + mapping[i].add((idx - self.stride_size, idx + self.mtlmodel.sequence_length - self.stride_size, snum)) + idx += self.stride_size + snum += 1 + + labelspans = torch.LongTensor(labelspans).to(self.device) + + for idx in mapping: + best = self.mtlmodel.sequence_length + for m in mapping[idx]: + start, end, snum = m + dist_to_end = end - idx + dist_to_start = idx - start + delta = abs(dist_to_end - dist_to_start) + if delta < best: + best = delta + final_mapping[idx] = (snum, idx - start) # Get sentence number and position in sentence + + self.mtlmodel.batch_size = len(spans) + start = time() + _, sbdlogits = self.mtlmodel(spans) + print ('dev processing time') + print (time() - start) + + loss = self.sbdcrf(sbdlogits,labelspans) * -1 + viterbi_tags = self.sbdcrf.viterbi_tags(sbdlogits) - dataset = [] - with open(file,'r') as fi: - lines = fi.readlines() - # split into contiguous sequence of seq_len length - for idx in range(0,len(lines),self.mtlmodel.sequence_length): - if idx + self.mtlmodel.sequence_length >= len(lines): - slice = lines[idx:len(lines)] - else: - slice = lines[idx:idx + self.mtlmodel.sequence_length] + labels = [] + for idx in final_mapping: + snum, position = final_mapping[idx] + label = 0 if viterbi_tags[snum][0][position] == 1 else 1 # B-SENT = 0, O = 1 - dataset.append(slice) + labels.append(label) + + return loss.item(), labels + + def train(self): + + def read_file(mode='train'): + + if mode == 'train' or mode == 'dev': # get sequences split across the seq_len parameter. No shingling. + dataset = [] + if mode == 'dev': + file = self.devdatafile + else: + file = self.trainingdatafile + with open(file,'r') as fi: + lines = fi.readlines() + # split into contiguous sequence of seq_len length + for idx in range(0,len(lines),self.mtlmodel.sequence_length): + if idx + self.mtlmodel.sequence_length >= len(lines): + slice = lines[idx:len(lines)] + else: + slice = lines[idx:idx + self.mtlmodel.sequence_length] + + dataset.append(slice) + else: + # get a long list of all tokens for shingling and prediction if not training. 
+ if mode == 'dev': + file = self.devdatafile + else: + file = self.testdatafile + with open(file,'r') as fi: + lines = fi.readlines() + dataset = [l.strip() for l in lines] return dataset @@ -195,29 +282,82 @@ def read_file(mode='train'): for epoch in range(1,epochs): - data = trainingdata[0:self.mtlmodel.batch_size] - poslogits, sbdlogits = self.mtlmodel(data) + data = sample(trainingdata,self.mtlmodel.batch_size) + data = [datum for datum in data if len(datum) == self.mtlmodel.sequence_length] + self.mtlmodel.batch_size = len(data) + + sents = [' '.join([s.split('\t')[0].strip() for s in sls]) for sls in data] + + _, sbdlogits = self.mtlmodel(sents) sbdtags = [[s.split('\t')[2].strip() for s in sls] for sls in data] sbdtags = [[self.sbd_tag2idx[t] for t in tag] for tag in sbdtags] sbdtags = torch.LongTensor(sbdtags).to(self.device) - postags = [[s.split('\t')[1].strip() for s in sls] for sls in data] - postags = [[self.mtlmodel.postagset[t] for t in tag] for tag in postags] - postags = torch.LongTensor(postags).to(self.device) + #postags = [[s.split('\t')[1].strip() for s in sls] for sls in data] + #postags = [[self.mtlmodel.postagset[t] for t in tag] for tag in postags] + #postags = torch.LongTensor(postags).to(self.device) - posloss = self.postagloss(poslogits,postags) - sbdloss = self.sbdcrf(sbdlogits,sbdtags) * -1 + #posloss = self.postagloss(poslogits,postags) + sbdloss = self.sbdcrf(sbdlogits,sbdtags) - mtlloss = posloss + sbdloss # uniform weighting. # TODO: learnable weights? + #mtlloss = posloss + sbdloss # uniform weighting. # TODO: learnable weights? self.optimizer.zero_grad() - mtlloss.backward() + #mtlloss.backward() + sbdloss.backward() self.optimizer.step() - self.writer.add_scalar('train_pos_loss', posloss.item(), epoch) + #self.writer.add_scalar('train_pos_loss', posloss.item(), epoch) self.writer.add_scalar('train_sbd_loss', sbdloss.item(), epoch) - self.writer.add_scalar('train_joint_loss', mtlloss.item(), epoch) + #self.writer.add_scalar('train_joint_loss', mtlloss.item(), epoch) + + if epoch % self.evalstep == 0: + + self.mtlmodel.eval() + old_batch_size = self.mtlmodel.batch_size + + data = [datum for datum in devdata if len(datum) == self.mtlmodel.sequence_length] + self.mtlmodel.batch_size = len(data) + + sents = [' '.join([s.split('\t')[0].strip() for s in sls]) for sls in data] + + sbdtags = [[s.split('\t')[2].strip() for s in sls] for sls in data] + sbdtags = [[self.sbd_tag2idx[t] for t in tag] for tag in sbdtags] + goldlabels = [t for tags in sbdtags for t in tags] + sbdtags = torch.LongTensor(sbdtags).to(self.device) + + _, sbdlogits = self.mtlmodel(sents) + + #spans = [s.split('\t')[0].strip() for s in devdata] + #labels = [s.split('\t')[2].strip() for s in devdata] + + #devloss, predictions = self.shingle_predict(spans,labels) + devloss = self.sbdcrf(sbdlogits,sbdtags) + tags = self.sbdcrf.viterbi_tags(sbdlogits) + preds = [] + for tag in tags: + preds.extend(tag[0]) + + + #labels = [self.mtlmodel.sbdtagset[l] for l in labels] + + f1 = f1_score(goldlabels,preds) + precision = precision_score(goldlabels,preds) + recall = recall_score(goldlabels,preds) + + self.writer.add_scalar("dev_loss",round(devloss.item(),2),int(epoch / self.evalstep)) + self.writer.add_scalar("dev_f1", round(f1,2), int(epoch / self.evalstep)) + self.writer.add_scalar("dev_precision", round(precision, 2), int(epoch / self.evalstep)) + self.writer.add_scalar("dev_recall", round(recall, 2), int(epoch / self.evalstep)) + + + print ('dev f1:' + str(f1)) + print('dev precision:' + 
str(precision)) + print('dev recall:' + str(recall)) + + self.mtlmodel.train() + self.mtlmodel.batch_size = old_batch_size def predict(self): From 294edc2b9682de6dfebad4fab5b63a9c0e4f56f2 Mon Sep 17 00:00:00 2001 From: nitin Date: Fri, 15 Jul 2022 00:14:25 +0800 Subject: [PATCH 04/32] sbd working..somewhat --- .../lib/multitask_sentsplitter_postagger.py | 233 +++++++++++------- 1 file changed, 141 insertions(+), 92 deletions(-) diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py index 1d29657..32bf06c 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -4,9 +4,11 @@ import os import shutil import flair +import random from flair.embeddings import TransformerWordEmbeddings from flair.data import Sentence +from transformers import BertModel,BertTokenizerFast from lib.allennlp.conditional_random_field import ConditionalRandomField from lib.allennlp.time_distributed import TimeDistributed from random import sample @@ -16,30 +18,31 @@ from time import time class MTLModel(nn.Module): - def __init__(self,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=128,batchsize=32): + def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,batchsize=16): super(MTLModel,self).__init__() - self.sbdtagset = {'B-SENT':0, 'O':1} + self.postagset = {'ADJ':0, 'ADP':1, 'ADV':2, 'AUX':3, 'CCONJ':4, 'DET':5, 'INTJ':6, 'NOUN':7, 'NUM':8, 'PRON':9, 'PROPN':10, 'PUNCT':11, 'SCONJ':12, 'SYM':13, 'VERB':14, 'X':15} # derived from HTB and IAHLTWiki trainsets #TODO: add other UD tags? - self.sequence_length = 32 + self.sequence_length = 128 self.batch_size = batchsize self.encodertype = encodertype - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + self.tokenizer = BertTokenizerFast.from_pretrained('onlplab/alephbert-base') + self.model = BertModel.from_pretrained('onlplab/alephbert-base').to(self.device) + # Flair embeddings do subword pooling! 
- self.transformerembeddings = TransformerWordEmbeddings(model='onlplab/alephbert-base',batch_size=self.batch_size,pooling_operation='mean',fine_tune=True).to(self.device) + #self.transformerembeddings = TransformerWordEmbeddings(model='onlplab/alephbert-base',batch_size=self.batch_size,pooling_operation='mean',fine_tune=True,layers="-1").to(self.device) # Bi-LSTM Encoder - self.embeddingdim = 768 * 4 # based on BERT model with Flair layers + self.embeddingdim = 768 * 1 # based on BERT model with Flair layers self.rnndim = rnndim self.rnnnumlayers = rnnnumlayers self.rnnbidirectional = rnnbidirectional self.rnndropout = rnndropout - if encodertype == 'lstm': self.encoder = nn.LSTM(input_size=self.embeddingdim, hidden_size=self.rnndim // 2, num_layers=self.rnnnumlayers, bidirectional=self.rnnbidirectional, @@ -49,30 +52,52 @@ def __init__(self,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3 num_layers=self.rnnnumlayers, bidirectional=self.rnnbidirectional, dropout=self.rnndropout,batch_first=True).to(self.device) + # param init + for name, param in self.encoder.named_parameters(): + if 'bias' in name: + nn.init.constant_(param,0.0) + elif 'weight' in name: + nn.init.xavier_normal_(param) + + self.relu = nn.ReLU() + + # Intermediate feedforward layer self.ffdim = ffdim self.fflayer = TimeDistributed(nn.Linear(in_features=self.rnndim,out_features=self.ffdim)).to(self.device) + # param init + for name, param in self.fflayer.named_parameters(): + if 'bias' in name: + nn.init.constant_(param, 0.0) + elif 'weight' in name: + nn.init.xavier_normal_(param) + # Label space for the pos tagger # TODO: CRF? #self.hidden2postag = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.postagset.keys()))).to(self.device) - + self.sbd_tag2idx = {'B-SENT': 1, + 'O': 0} # self.START_TAG: 2,self.STOP_TAG: 3} # AllenNLP CRF expects start and stop tags to be appended at the end, in that order # Label space for sent splitter - self.hidden2sbd = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.sbdtagset.keys()))).to(self.device) + self.hidden2sbd = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.sbd_tag2idx.keys())).to(self.device)) + # param init + for name, param in self.hidden2sbd.named_parameters(): + if 'bias' in name: + nn.init.constant_(param, 0.0) + elif 'weight' in name: + nn.init.xavier_normal_(param) + + self.sbddtransitions = [(0, 1), (1, 0)] - def init_hidden(self): """ - Used by RNN-type encoders + #self.START_TAG = "" + #self.STOP_TAG = "" + + self.sbdcrf = ConditionalRandomField(len(self.sbd_tag2idx), self.sbddtransitions,include_start_end_transitions=False).to( + self.device) # dont include the START and STOP tags in the label count """ - if self.rnnbidirectional == True: - numdir = 2 - else: - numdir = 1 - - return (torch.randn(self.rnnnumlayers * numdir, self.batch_size, self.rnndim // 2, device=self.device), - torch.randn(self.rnnnumlayers * numdir, self.batch_size, self.rnndim // 2, device=self.device)) def forward(self,data): @@ -80,37 +105,52 @@ def forward(self,data): slice is a list of tuples of length = seq_len. Each tuple is (token, pos tag, sentence boundary label) """ + data = [d.split() for d in data] # for AlephBERT + tokens = self.tokenizer(data,return_tensors='pt',padding=True,is_split_into_words=True).to(self.device) # tell AlephBERT that there is some tokenization already. Otherwise its own subword tokenization messes things up. 
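The subword pooling that this forward pass performs by hand can be reproduced in isolation with a fast tokenizer's word alignment. A stripped-down sketch, assuming only the AlephBERT checkpoint already named in the patch and stock Hugging Face APIs (word_ids(), last_hidden_state):

import torch
from transformers import BertModel, BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('onlplab/alephbert-base')
model = BertModel.from_pretrained('onlplab/alephbert-base')

words = 'גנן גידל דגן בגן'.split()                 # pre-tokenized input, as in the patch
enc = tokenizer([words], is_split_into_words=True,
                return_tensors='pt', padding=True)
with torch.no_grad():
    hidden = model(**enc).last_hidden_state        # (1, n_subwords, 768)

word_ids = enc.word_ids(batch_index=0)             # e.g. [None, 0, 0, 1, 2, 3, None]
pooled = []
for w in range(len(words)):
    idx = [j for j, wid in enumerate(word_ids) if wid == w]
    # average the subword vectors of word w; [CLS]/[SEP]/[PAD] carry word id None
    # and therefore never enter any average
    pooled.append(hidden[0, idx].mean(dim=0))
pooled = torch.stack(pooled)                       # (n_words, 768)
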
+ + embeddings = self.model(**tokens) + embeddings = embeddings[0] + + """ + Average the subword embeddings + This process will drop the [CLS],[SEP] and [PAD] tokens + """ #start = time() - sentences = [] - for sent in data: - sentences.append(Sentence(sent,use_tokenizer=False)) - - #startlookup = time() - sentences = self.transformerembeddings.embed(sentences) # subwords are averaged here - #print ('embedding lookup timing') - #print (time() - startlookup) - - # need to extract the embeddings from Token objects, and add padding. - embeddings = [] - for sent in sentences: - embedding = [] - for tok in sent: - embedding.append(tok.embedding) - for _ in range(len(sent),self.sequence_length): - embedding.append(torch.zeros(768 * 4,device=self.device)) # for padding. 4 because of the Flair TransFormerWordEmbeddings param - embeddings.append(torch.stack(embedding).to(self.device)) - - - embeddings = torch.stack(embeddings).to(self.device) # final embeddings in a tensor - #print ('embedding timing') + avgembeddings = [] + for k in range(0,len(tokens.encodings)): + emb = [] + maxindex = max([w for w in tokens.encodings[k].words if w]) + assert maxindex == self.sequence_length - 1 # otherwise won't average correctly and align with labels + + for i in range(0,self.sequence_length): + + indices = [j for j,x in enumerate(tokens.encodings[k].words) if x == i] + if len(indices) == 0: # This strange case needs to be handled. + emb.append(torch.zeros(768,device=self.device)) + elif len(indices) == 1: # no need to average + emb.append(embeddings[k][indices[0]]) + else: # needs to aggregate - average + slice = embeddings[k][indices[0]:indices[-1] + 1] + slice = torch.mean(input=slice,dim=0,keepdim=False) + emb.append(slice) + + + assert len(emb) == self.sequence_length # averaging was correct and aligns with the labels + + emb = torch.stack(emb) + avgembeddings.append(emb) + + avgembeddings = torch.stack(avgembeddings) + #print ('average embeddings') #print (time() - start) + #if self.encodertype in ('lstm','gru'): - hidden = self.init_hidden() - feats, hidden = self.encoder(embeddings,hidden) + feats, _ = self.encoder(avgembeddings) # Intermediate Feedforward layer feats = self.fflayer(feats) + feats = self.relu(feats) # logits for pos #poslogits = self.hidden2postag(feats) @@ -118,11 +158,16 @@ def forward(self,data): # logits for sbd sbdlogits = self.hidden2sbd(feats) + sbdlogits = sbdlogits.permute(0,2,1) + + #sbdloss = self.sbdcrf(sbdlogits, sbdtags) + #viterbitags = self.sbdcrf.viterbi_tags(sbdlogits) - return None,sbdlogits + #return None,sbdloss,viterbitags + return sbdlogits class Tagger(): - def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnndim=128,rnnnumlayers=1,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=128,learningrate = 0.01): + def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,learningrate = 0.0001): self.mtlmodel = MTLModel(rnndim,rnnnumlayers,rnnbidirectional,rnndropout,encodertype,ffdim) @@ -151,24 +196,26 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnnd self.testfile = testfile self.learningrate = learningrate - self.optimizer = torch.optim.AdamW(self.mtlmodel.parameters(), lr=learningrate) # Loss for pos tagging - self.postagloss = nn.CrossEntropyLoss() - self.postagloss.to(self.device) + #self.postagloss = nn.CrossEntropyLoss() + #self.postagloss.to(self.device) - # Linear CRF Loss for 
sent splitter - self.START_TAG = "" - self.STOP_TAG = "" - self.sbd_tag2idx = {'B-SENT': 0, 'O': 1, self.START_TAG: 2, - self.STOP_TAG: 3} # AllenNLP CRF expects start and stop tags to be appended at the end, in that order - self.sbddtransitions = [(0, 1), (1, 0), (2, 0), (2, 1), (0, 3), (1, 3)] - self.sbdcrf = ConditionalRandomField(len(self.sbd_tag2idx) - 2, self.sbddtransitions).to(self.device) # dont include the START and STOP tags in the label count + self.sbdloss = nn.CrossEntropyLoss() + self.sbdloss.to(self.device) - self.evalstep = 1 + self.optimizer = torch.optim.Adam(list(self.mtlmodel.encoder.parameters()) + list(self.mtlmodel.fflayer.parameters()) + list(self.mtlmodel.hidden2sbd.parameters()), lr=learningrate) + self.evalstep = 20 self.stride_size = 10 + self.set_seed(42) + + def set_seed(self, seed): + + random.seed(seed) + torch.manual_seed(seed) + def shingle_predict(self,toks,labels=None,type='sbd'): @@ -231,8 +278,8 @@ def shingle_predict(self,toks,labels=None,type='sbd'): print ('dev processing time') print (time() - start) - loss = self.sbdcrf(sbdlogits,labelspans) * -1 - viterbi_tags = self.sbdcrf.viterbi_tags(sbdlogits) + loss = self.mtlmodel.sbdcrf(sbdlogits,labelspans) * -1 + viterbi_tags = self.mtlmodel.sbdcrf.viterbi_tags(sbdlogits) labels = [] for idx in final_mapping: @@ -255,6 +302,7 @@ def read_file(mode='train'): file = self.trainingdatafile with open(file,'r') as fi: lines = fi.readlines() + lines = list(reversed(lines)) # hebrew is right to left... # split into contiguous sequence of seq_len length for idx in range(0,len(lines),self.mtlmodel.sequence_length): if idx + self.mtlmodel.sequence_length >= len(lines): @@ -275,35 +323,37 @@ def read_file(mode='train'): return dataset - epochs = 1000 + epochs = 2000 trainingdata = read_file() devdata = read_file(mode='dev') for epoch in range(1,epochs): + self.mtlmodel.train() + self.optimizer.zero_grad() + data = sample(trainingdata,self.mtlmodel.batch_size) data = [datum for datum in data if len(datum) == self.mtlmodel.sequence_length] self.mtlmodel.batch_size = len(data) sents = [' '.join([s.split('\t')[0].strip() for s in sls]) for sls in data] - _, sbdlogits = self.mtlmodel(sents) - sbdtags = [[s.split('\t')[2].strip() for s in sls] for sls in data] - sbdtags = [[self.sbd_tag2idx[t] for t in tag] for tag in sbdtags] + sbdtags = [[self.mtlmodel.sbd_tag2idx[t] for t in tag] for tag in sbdtags] sbdtags = torch.LongTensor(sbdtags).to(self.device) + sbdlogits = self.mtlmodel(sents) + #postags = [[s.split('\t')[1].strip() for s in sls] for sls in data] #postags = [[self.mtlmodel.postagset[t] for t in tag] for tag in postags] #postags = torch.LongTensor(postags).to(self.device) #posloss = self.postagloss(poslogits,postags) - sbdloss = self.sbdcrf(sbdlogits,sbdtags) + sbdloss = self.sbdloss(sbdlogits,sbdtags) - #mtlloss = posloss + sbdloss # uniform weighting. # TODO: learnable weights? - self.optimizer.zero_grad() + #mtlloss = posloss + sbdloss # uniform weighting. # TODO: learnable weights? 
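One detail worth keeping in mind here: the logits are permuted to (batch, num_tags, seq_len) inside the forward pass because nn.CrossEntropyLoss puts the class dimension second whenever the targets carry an extra sequence dimension. A toy shape check, with arbitrary sizes and nothing beyond stock PyTorch:

import torch
import torch.nn as nn

batch, seq_len, num_tags = 4, 16, 2
logits = torch.randn(batch, seq_len, num_tags)          # tagger output, tags last
targets = torch.randint(0, num_tags, (batch, seq_len))  # gold tag ids per token

loss = nn.CrossEntropyLoss()(logits.permute(0, 2, 1), targets)  # (N, C, L) vs (N, L)
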
#mtlloss.backward() sbdloss.backward() self.optimizer.step() @@ -315,46 +365,45 @@ def read_file(mode='train'): if epoch % self.evalstep == 0: self.mtlmodel.eval() - old_batch_size = self.mtlmodel.batch_size - data = [datum for datum in devdata if len(datum) == self.mtlmodel.sequence_length] - self.mtlmodel.batch_size = len(data) + with torch.no_grad(): + old_batch_size = self.mtlmodel.batch_size - sents = [' '.join([s.split('\t')[0].strip() for s in sls]) for sls in data] + data = [datum for datum in devdata if len(datum) == self.mtlmodel.sequence_length] + self.mtlmodel.batch_size = len(data) - sbdtags = [[s.split('\t')[2].strip() for s in sls] for sls in data] - sbdtags = [[self.sbd_tag2idx[t] for t in tag] for tag in sbdtags] - goldlabels = [t for tags in sbdtags for t in tags] - sbdtags = torch.LongTensor(sbdtags).to(self.device) + sents = [' '.join([s.split('\t')[0].strip() for s in sls]) for sls in data] - _, sbdlogits = self.mtlmodel(sents) + sbdtags = [[s.split('\t')[2].strip() for s in sls] for sls in data] + sbdtags = [[self.mtlmodel.sbd_tag2idx[t] for t in tag] for tag in sbdtags] + goldlabels = [t for tags in sbdtags for t in tags] + sbdtags = torch.LongTensor(sbdtags).to(self.device) - #spans = [s.split('\t')[0].strip() for s in devdata] - #labels = [s.split('\t')[2].strip() for s in devdata] + sbdlogits = self.mtlmodel(sents) + devloss = self.sbdloss(sbdlogits,sbdtags) - #devloss, predictions = self.shingle_predict(spans,labels) - devloss = self.sbdcrf(sbdlogits,sbdtags) - tags = self.sbdcrf.viterbi_tags(sbdlogits) - preds = [] - for tag in tags: - preds.extend(tag[0]) + #spans = [s.split('\t')[0].strip() for s in devdata] + #labels = [s.split('\t')[2].strip() for s in devdata] + #devloss, predictions = self.shingle_predict(spans,labels) + preds = torch.flatten(torch.argmax(sbdlogits,1)) + preds = preds.tolist() - #labels = [self.mtlmodel.sbdtagset[l] for l in labels] + #labels = [self.mtlmodel.sbdtagset[l] for l in labels] - f1 = f1_score(goldlabels,preds) - precision = precision_score(goldlabels,preds) - recall = recall_score(goldlabels,preds) + f1 = f1_score(goldlabels,preds) + precision = precision_score(goldlabels,preds) + recall = recall_score(goldlabels,preds) - self.writer.add_scalar("dev_loss",round(devloss.item(),2),int(epoch / self.evalstep)) - self.writer.add_scalar("dev_f1", round(f1,2), int(epoch / self.evalstep)) - self.writer.add_scalar("dev_precision", round(precision, 2), int(epoch / self.evalstep)) - self.writer.add_scalar("dev_recall", round(recall, 2), int(epoch / self.evalstep)) + self.writer.add_scalar("dev_loss",round(devloss.item(),2),int(epoch / self.evalstep)) + self.writer.add_scalar("dev_f1", round(f1,2), int(epoch / self.evalstep)) + self.writer.add_scalar("dev_precision", round(precision, 2), int(epoch / self.evalstep)) + self.writer.add_scalar("dev_recall", round(recall, 2), int(epoch / self.evalstep)) - print ('dev f1:' + str(f1)) - print('dev precision:' + str(precision)) - print('dev recall:' + str(recall)) + print ('dev f1:' + str(f1)) + print('dev precision:' + str(precision)) + print('dev recall:' + str(recall)) self.mtlmodel.train() self.mtlmodel.batch_size = old_batch_size From 7322665d804f2e0ad9963b23ec2ad48cc8d534d0 Mon Sep 17 00:00:00 2001 From: nitin Date: Fri, 15 Jul 2022 23:18:48 +0800 Subject: [PATCH 05/32] shingling --- .../lib/multitask_sentsplitter_postagger.py | 252 ++++++++++++------ 1 file changed, 169 insertions(+), 83 deletions(-) diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py 
b/hebpipe/lib/multitask_sentsplitter_postagger.py index 32bf06c..ccf562b 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -5,6 +5,8 @@ import shutil import flair import random +import math +import gc from flair.embeddings import TransformerWordEmbeddings from flair.data import Sentence @@ -17,25 +19,48 @@ from time import time +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256,garbage_collection_threshold:0.2" + + +class PositionalEncoding(nn.Module): + + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) + + def forward(self, x): + x = x + self.pe[:x.size(0), :] + return self.dropout(x) + + + class MTLModel(nn.Module): - def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,batchsize=16): + def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,batchsize=8,transformernumlayers=6,nhead=8,sequencelength=128): super(MTLModel,self).__init__() + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + #self.device = 'cpu' + #self.START_TAG = "" + #self.STOP_TAG = "" self.postagset = {'ADJ':0, 'ADP':1, 'ADV':2, 'AUX':3, 'CCONJ':4, 'DET':5, 'INTJ':6, 'NOUN':7, 'NUM':8, 'PRON':9, 'PROPN':10, 'PUNCT':11, 'SCONJ':12, 'SYM':13, 'VERB':14, 'X':15} # derived from HTB and IAHLTWiki trainsets #TODO: add other UD tags? + self.sbd_tag2idx = {'B-SENT': 1,'O': 0,}#self.START_TAG:2,self.STOP_TAG:3} - self.sequence_length = 128 + self.sequence_length = sequencelength self.batch_size = batchsize self.encodertype = encodertype - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - self.tokenizer = BertTokenizerFast.from_pretrained('onlplab/alephbert-base') self.model = BertModel.from_pretrained('onlplab/alephbert-base').to(self.device) - # Flair embeddings do subword pooling! 
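The PositionalEncoding module introduced above is the standard sinusoidal scheme, PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d)). As written it slices the buffer by x.size(0), so it expects sequence-first input of shape (seq_len, batch, d_model); a minimal usage sketch with hypothetical sizes:

import torch

pos_enc = PositionalEncoding(d_model=768, dropout=0.1)  # the class defined in this patch
x = torch.zeros(128, 4, 768)                            # (seq_len, batch, d_model)
out = pos_enc(x)                                        # same shape: sin/cos offsets added, then dropout
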
- #self.transformerembeddings = TransformerWordEmbeddings(model='onlplab/alephbert-base',batch_size=self.batch_size,pooling_operation='mean',fine_tune=True,layers="-1").to(self.device) - # Bi-LSTM Encoder self.embeddingdim = 768 * 1 # based on BERT model with Flair layers self.rnndim = rnndim @@ -51,20 +76,31 @@ def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3 self.encoder = nn.GRU(input_size=self.embeddingdim, hidden_size=self.rnndim // 2, num_layers=self.rnnnumlayers, bidirectional=self.rnnbidirectional, dropout=self.rnndropout,batch_first=True).to(self.device) + elif self.encodertype == 'transformer': + self.transformernumlayers = transformernumlayers + self.nhead = nhead + self.encoderlayer = nn.TransformerEncoderLayer(d_model= self.embeddingdim,nhead=nhead).to(self.device) + self.encoder = nn.TransformerEncoder(self.encoderlayer,num_layers=self.transformernumlayers).to(self.device) + self.posencoder = PositionalEncoding(d_model=self.embeddingdim).to(self.device) # param init for name, param in self.encoder.named_parameters(): - if 'bias' in name: + try: + if 'bias' in name: + nn.init.constant_(param,0.0) + elif 'weight' in name: + nn.init.xavier_uniform_(param) + except ValueError as ex: nn.init.constant_(param,0.0) - elif 'weight' in name: - nn.init.xavier_normal_(param) self.relu = nn.ReLU() - # Intermediate feedforward layer self.ffdim = ffdim - self.fflayer = TimeDistributed(nn.Linear(in_features=self.rnndim,out_features=self.ffdim)).to(self.device) + if self.encodertype == 'transformer': + self.fflayer = TimeDistributed(nn.Linear(in_features=self.embeddingdim, out_features=self.ffdim)).to(self.device) + else: + self.fflayer = TimeDistributed(nn.Linear(in_features=self.rnndim, out_features=self.ffdim)).to(self.device) # param init for name, param in self.fflayer.named_parameters(): @@ -76,8 +112,7 @@ def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3 # Label space for the pos tagger # TODO: CRF? #self.hidden2postag = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.postagset.keys()))).to(self.device) - self.sbd_tag2idx = {'B-SENT': 1, - 'O': 0} # self.START_TAG: 2,self.STOP_TAG: 3} # AllenNLP CRF expects start and stop tags to be appended at the end, in that order + # Label space for sent splitter self.hidden2sbd = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.sbd_tag2idx.keys())).to(self.device)) @@ -88,23 +123,17 @@ def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3 elif 'weight' in name: nn.init.xavier_normal_(param) - self.sbddtransitions = [(0, 1), (1, 0)] + self.sigmoid = nn.Sigmoid() + #self.sbdcrf = ConditionalRandomField(len(self.sbd_tag2idx),include_start_end_transitions=False).to(self.device) - """ - #self.START_TAG = "" - #self.STOP_TAG = "" - - self.sbdcrf = ConditionalRandomField(len(self.sbd_tag2idx), self.sbddtransitions,include_start_end_transitions=False).to( - self.device) # dont include the START and STOP tags in the label count - """ def forward(self,data): """ slice is a list of tuples of length = seq_len. Each tuple is (token, pos tag, sentence boundary label) """ - + badrecords = [] data = [d.split() for d in data] # for AlephBERT tokens = self.tokenizer(data,return_tensors='pt',padding=True,is_split_into_words=True).to(self.device) # tell AlephBERT that there is some tokenization already. Otherwise its own subword tokenization messes things up. 
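When encodertype == 'transformer', the encoder above composes the positional encoding with a stack of nn.TransformerEncoderLayer blocks. Roughly, and with toy sizes (PyTorch's default layout for these modules is sequence-first):

import torch
import torch.nn as nn

d_model, nhead, num_layers = 768, 8, 6
pos_enc = PositionalEncoding(d_model=d_model)             # from this patch
layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
encoder = nn.TransformerEncoder(layer, num_layers=num_layers)

x = torch.randn(128, 2, d_model)                          # (seq_len, batch, d_model)
feats = encoder(pos_enc(x))                               # same shape; goes on to the feedforward and label layers
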
@@ -120,7 +149,12 @@ def forward(self,data): for k in range(0,len(tokens.encodings)): emb = [] maxindex = max([w for w in tokens.encodings[k].words if w]) - assert maxindex == self.sequence_length - 1 # otherwise won't average correctly and align with labels + try: + assert maxindex == self.sequence_length - 1 # otherwise won't average correctly and align with labels + except AssertionError: + print ('max index not equal sequence len. Skipping.') + badrecords.append(k) + continue for i in range(0,self.sequence_length): @@ -135,18 +169,27 @@ def forward(self,data): emb.append(slice) - assert len(emb) == self.sequence_length # averaging was correct and aligns with the labels + try: + assert len(emb) == self.sequence_length # averaging was correct and aligns with the labels + except AssertionError: + print ('embedding not built correctly. Skipping') + badrecords.append(k) + continue emb = torch.stack(emb) avgembeddings.append(emb) avgembeddings = torch.stack(avgembeddings) + #print ('average embeddings') #print (time() - start) + if self.encodertype in ('lstm','gru'): + feats, _ = self.encoder(avgembeddings) + else: + feats = self.posencoder(avgembeddings) + feats = self.encoder(feats) - #if self.encodertype in ('lstm','gru'): - feats, _ = self.encoder(avgembeddings) # Intermediate Feedforward layer feats = self.fflayer(feats) @@ -158,13 +201,23 @@ def forward(self,data): # logits for sbd sbdlogits = self.hidden2sbd(feats) + #sbdlogits = torch.squeeze(sbdlogits) + sbdlogits = sbdlogits.permute(0,2,1) #sbdloss = self.sbdcrf(sbdlogits, sbdtags) #viterbitags = self.sbdcrf.viterbi_tags(sbdlogits) #return None,sbdloss,viterbitags - return sbdlogits + + del embeddings + del avgembeddings + del feats + + gc.collect() + torch.cuda.empty_cache() + + return sbdlogits,badrecords class Tagger(): def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,learningrate = 0.0001): @@ -189,6 +242,7 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnnd self.testdatafile = '../data/sentsplit_postag_test_gold.tab' self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + #self.device = 'cpu' self.trainflag = trainflag self.trainfile = trainfile @@ -201,10 +255,11 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnnd #self.postagloss = nn.CrossEntropyLoss() #self.postagloss.to(self.device) - self.sbdloss = nn.CrossEntropyLoss() + self.sbdloss = nn.CrossEntropyLoss() #(weight=torch.FloatTensor([1,5])) self.sbdloss.to(self.device) - self.optimizer = torch.optim.Adam(list(self.mtlmodel.encoder.parameters()) + list(self.mtlmodel.fflayer.parameters()) + list(self.mtlmodel.hidden2sbd.parameters()), lr=learningrate) + self.optimizer = torch.optim.Adam(list(self.mtlmodel.encoder.parameters()) + list(self.mtlmodel.fflayer.parameters()) + list(self.mtlmodel.hidden2sbd.parameters()), lr=learningrate) + #self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer,milestones=[500,2500],gamma=0.01) self.evalstep = 20 self.stride_size = 10 @@ -248,9 +303,7 @@ def shingle_predict(self,toks,labels=None,type='sbd'): sent = " ".join(span) spans.append(sent) if labels: - if type == 'sbd': - label = [self.mtlmodel.sbdtagset[l.strip()] for l in labelspan] - labelspans.append(label) + labelspans.append(labelspan) for i in range(idx - self.stride_size, idx + self.mtlmodel.sequence_length - self.stride_size): # start, end, snum @@ -259,7 
+312,7 @@ def shingle_predict(self,toks,labels=None,type='sbd'): idx += self.stride_size snum += 1 - labelspans = torch.LongTensor(labelspans).to(self.device) + for idx in mapping: best = self.mtlmodel.sequence_length @@ -273,57 +326,72 @@ def shingle_predict(self,toks,labels=None,type='sbd'): final_mapping[idx] = (snum, idx - start) # Get sentence number and position in sentence self.mtlmodel.batch_size = len(spans) - start = time() - _, sbdlogits = self.mtlmodel(spans) - print ('dev processing time') - print (time() - start) - loss = self.mtlmodel.sbdcrf(sbdlogits,labelspans) * -1 - viterbi_tags = self.mtlmodel.sbdcrf.viterbi_tags(sbdlogits) + sbdlogits,badrecords = self.mtlmodel(spans) + predictions = torch.argmax(sbdlogits,dim=1) + + badrecords = sorted(badrecords,reverse=True) + for record in badrecords: + labelspans.pop(record) + + labelspans = torch.LongTensor(labelspans).to(self.device) labels = [] for idx in final_mapping: snum, position = final_mapping[idx] - label = 0 if viterbi_tags[snum][0][position] == 1 else 1 # B-SENT = 0, O = 1 + label = predictions[snum][position].item() labels.append(label) - return loss.item(), labels + loss = self.sbdloss(sbdlogits,labelspans) + + del sbdlogits + del labelspans + + #gc.collect() + torch.cuda.empty_cache() + + #print (torch.cuda.memory_stats("cuda:0")) + + + return labels,loss.item() def train(self): def read_file(mode='train'): - if mode == 'train' or mode == 'dev': # get sequences split across the seq_len parameter. No shingling. - dataset = [] - if mode == 'dev': - file = self.devdatafile - else: - file = self.trainingdatafile - with open(file,'r') as fi: + dataset = [] + if mode == 'dev': + with open(self.devdatafile, 'r') as fi: lines = fi.readlines() - lines = list(reversed(lines)) # hebrew is right to left... - # split into contiguous sequence of seq_len length - for idx in range(0,len(lines),self.mtlmodel.sequence_length): + lines = list(reversed(lines)) # hebrew is right to left... + + # shingle it here to get more training data + for idx in range(0, len(lines), 128): if idx + self.mtlmodel.sequence_length >= len(lines): slice = lines[idx:len(lines)] else: - slice = lines[idx:idx + self.mtlmodel.sequence_length] + slice = lines[idx: idx + self.mtlmodel.sequence_length] dataset.append(slice) + else: - # get a long list of all tokens for shingling and prediction if not training. - if mode == 'dev': - file = self.devdatafile - else: - file = self.testdatafile - with open(file,'r') as fi: + with open(self.trainingdatafile,'r') as fi: lines = fi.readlines() - dataset = [l.strip() for l in lines] + lines = list(reversed(lines)) # hebrew is right to left... 
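The rewritten read_file above turns one long stream of token lines into many partially overlapping training sequences by sliding a window with a small stride (sequence_length=128 and stride_size=10 in the patch). The same idea in isolation, keeping the patch's behaviour of emitting progressively shorter tail windows once a full window no longer fits:

def overlapping_windows(lines, seq_len=128, stride=10):
    windows = []
    for start in range(0, len(lines), stride):
        if start + seq_len >= len(lines):
            windows.append(lines[start:])               # shorter tail window, as in the patch
        else:
            windows.append(lines[start:start + seq_len])
    return windows

# e.g. 1000 token lines -> 100 overlapping windows instead of ~8 disjoint ones
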
+ + # shingle it here to get more training data + for idx in range(0,len(lines),self.stride_size): + if idx + self.mtlmodel.sequence_length >= len(lines): + slice = lines[idx:len(lines)] + else: + slice = lines[idx: idx + self.mtlmodel.sequence_length] + + dataset.append(slice) return dataset - epochs = 2000 + epochs = 3000 trainingdata = read_file() devdata = read_file(mode='dev') @@ -341,22 +409,21 @@ def read_file(mode='train'): sbdtags = [[s.split('\t')[2].strip() for s in sls] for sls in data] sbdtags = [[self.mtlmodel.sbd_tag2idx[t] for t in tag] for tag in sbdtags] - sbdtags = torch.LongTensor(sbdtags).to(self.device) - sbdlogits = self.mtlmodel(sents) + sbdlogits,badrecords = self.mtlmodel(sents) + badrecords = sorted(badrecords,reverse=True) - #postags = [[s.split('\t')[1].strip() for s in sls] for sls in data] - #postags = [[self.mtlmodel.postagset[t] for t in tag] for tag in postags] - #postags = torch.LongTensor(postags).to(self.device) + for record in badrecords: + sbdtags.pop(record) - #posloss = self.postagloss(poslogits,postags) + sbdtags = torch.LongTensor(sbdtags).to(self.device) sbdloss = self.sbdloss(sbdlogits,sbdtags) - #mtlloss = posloss + sbdloss # uniform weighting. # TODO: learnable weights? #mtlloss.backward() sbdloss.backward() self.optimizer.step() + #self.scheduler.step() #self.writer.add_scalar('train_pos_loss', posloss.item(), epoch) self.writer.add_scalar('train_sbd_loss', sbdloss.item(), epoch) @@ -367,8 +434,9 @@ def read_file(mode='train'): self.mtlmodel.eval() with torch.no_grad(): - old_batch_size = self.mtlmodel.batch_size + """ + old_batch_size = self.mtlmodel.batch_size data = [datum for datum in devdata if len(datum) == self.mtlmodel.sequence_length] self.mtlmodel.batch_size = len(data) @@ -378,24 +446,39 @@ def read_file(mode='train'): sbdtags = [[self.mtlmodel.sbd_tag2idx[t] for t in tag] for tag in sbdtags] goldlabels = [t for tags in sbdtags for t in tags] sbdtags = torch.LongTensor(sbdtags).to(self.device) + """ + + + totaldevloss = 0 + allpreds = [] + allgold = [] + for slice in devdata: + sents = [s.split('\t')[0].strip() for s in slice] + goldlabels = [s.split('\t')[2].strip() for s in slice] + goldlabels = [self.mtlmodel.sbd_tag2idx[s] for s in goldlabels] + + preds,devloss = self.shingle_predict(sents,goldlabels) + totaldevloss += devloss + allpreds.extend(preds) + allgold.extend(goldlabels) + + + + """ sbdlogits = self.mtlmodel(sents) devloss = self.sbdloss(sbdlogits,sbdtags) - #spans = [s.split('\t')[0].strip() for s in devdata] - #labels = [s.split('\t')[2].strip() for s in devdata] - - #devloss, predictions = self.shingle_predict(spans,labels) - preds = torch.flatten(torch.argmax(sbdlogits,1)) + preds = torch.argmax(sbdlogits,dim=1) preds = preds.tolist() + preds = [p for pred in preds for p in pred] + """ - #labels = [self.mtlmodel.sbdtagset[l] for l in labels] - - f1 = f1_score(goldlabels,preds) - precision = precision_score(goldlabels,preds) - recall = recall_score(goldlabels,preds) + f1 = f1_score(allgold,allpreds) + precision = precision_score(allgold,allpreds) + recall = recall_score(allgold,allpreds) - self.writer.add_scalar("dev_loss",round(devloss.item(),2),int(epoch / self.evalstep)) + self.writer.add_scalar("dev_loss",round(totaldevloss,2),int(epoch / self.evalstep)) self.writer.add_scalar("dev_f1", round(f1,2), int(epoch / self.evalstep)) self.writer.add_scalar("dev_precision", round(precision, 2), int(epoch / self.evalstep)) self.writer.add_scalar("dev_recall", round(recall, 2), int(epoch / self.evalstep)) @@ -405,8 +488,9 
@@ def read_file(mode='train'): print('dev precision:' + str(precision)) print('dev recall:' + str(recall)) + torch.cuda.empty_cache() self.mtlmodel.train() - self.mtlmodel.batch_size = old_batch_size + #self.mtlmodel.batch_size = old_batch_size def predict(self): @@ -462,12 +546,14 @@ def main(): # testing only iahltwikitrain = '/home/nitin/Desktop/IAHLT/UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-train.conllu' iahltwikidev = '/home/nitin/Desktop/IAHLT/UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-dev.conllu' + + tagger = Tagger(trainflag=True,trainfile=iahltwikitrain,devfile=iahltwikidev) - #tagger.prepare_data_files() + tagger.prepare_data_files() tagger.train() print ('here') if __name__ == "__main__": - main() \ No newline at end of file + main() From eccd68b017165adb9273b97402597d18f40574ff Mon Sep 17 00:00:00 2001 From: nitin Date: Sun, 17 Jul 2022 11:37:38 +0800 Subject: [PATCH 06/32] working crf --- hebpipe/lib/crfutils/crf.py | 48 ++++ hebpipe/lib/crfutils/viterbi.py | 241 ++++++++++++++++++ .../lib/multitask_sentsplitter_postagger.py | 102 +++++--- 3 files changed, 352 insertions(+), 39 deletions(-) create mode 100644 hebpipe/lib/crfutils/crf.py create mode 100644 hebpipe/lib/crfutils/viterbi.py diff --git a/hebpipe/lib/crfutils/crf.py b/hebpipe/lib/crfutils/crf.py new file mode 100644 index 0000000..aa99d17 --- /dev/null +++ b/hebpipe/lib/crfutils/crf.py @@ -0,0 +1,48 @@ +import torch + +START_TAG: str = "" +STOP_TAG: str = "" + +class CRF(torch.nn.Module): + """ + Conditional Random Field Implementation according to sgrvinod (https://github.com/sgrvinod). + Classifier which predicts single tag / class / label for given word based on not just the word, + but also on previous seen annotations. + """ + + def __init__(self, tag_dictionary, tagset_size: int, init_from_state_dict: bool): + """ + :param tag_dictionary: tag dictionary in order to find ID for start and stop tags + :param tagset_size: number of tag from tag dictionary + :param init_from_state_dict: whether we load pretrained model from state dict + """ + super(CRF, self).__init__() + + self.tagset_size = tagset_size + + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + # Transitions are used in the following way: transitions[to, from]. + self.transitions = torch.nn.Parameter(torch.randn(tagset_size, tagset_size)) + # If we are not using a pretrained model and train a fresh one, we need to set transitions from any tag + # to START-tag and from STOP-tag to any other tag to -10000. + if not init_from_state_dict: + self.transitions.detach()[tag_dictionary.get_idx_for_item(START_TAG), :] = -10000 + + self.transitions.detach()[:, tag_dictionary.get_idx_for_item(STOP_TAG)] = -10000 + self.to(self.device) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + """ + Forward propagation of Conditional Random Field. 
+ :param features: output from RNN / Linear layer in shape (batch size, seq len, hidden size) + :return: CRF scores (emission scores for each token + transitions prob from previous state) in + shape (batch_size, seq len, tagset size, tagset size) + """ + batch_size, seq_len = features.size()[:2] + + emission_scores = features + emission_scores = emission_scores.unsqueeze(-1).expand(batch_size, seq_len, self.tagset_size, self.tagset_size) + + crf_scores = emission_scores + self.transitions.unsqueeze(0).unsqueeze(0) + return crf_scores \ No newline at end of file diff --git a/hebpipe/lib/crfutils/viterbi.py b/hebpipe/lib/crfutils/viterbi.py new file mode 100644 index 0000000..75f7f56 --- /dev/null +++ b/hebpipe/lib/crfutils/viterbi.py @@ -0,0 +1,241 @@ +from typing import Tuple + +import numpy as np +import torch +import torch.nn +from torch.nn.functional import softmax +from torch.nn.utils.rnn import pack_padded_sequence + +import flair +from flair.data import Dictionary, Label, List, Sentence + +START_TAG: str = "" +STOP_TAG: str = "" + + +class ViterbiLoss(torch.nn.Module): + """ + Calculates the loss for each sequence up to its length t. + """ + + def __init__(self, tag_dictionary: Dictionary): + """ + :param tag_dictionary: tag_dictionary of task + """ + super(ViterbiLoss, self).__init__() + self.tag_dictionary = tag_dictionary + self.tagset_size = len(tag_dictionary) + self.start_tag = tag_dictionary.get_idx_for_item(START_TAG) + self.stop_tag = tag_dictionary.get_idx_for_item(STOP_TAG) + + def forward(self, features_tuple: tuple, targets: torch.Tensor) -> torch.Tensor: + """ + Forward propagation of Viterbi Loss + :param features_tuple: CRF scores from forward method in shape (batch size, seq len, tagset size, tagset size), + lengths of sentences in batch, transitions from CRF + :param targets: true tags for sentences which will be converted to matrix indices. 
+ :return: average Viterbi Loss over batch size + """ + features, lengths, transitions = features_tuple + + batch_size = features.size(0) + seq_len = features.size(1) + + targets, targets_matrix_indices = self._format_targets(targets, lengths) + targets_matrix_indices = torch.tensor(targets_matrix_indices, dtype=torch.long).unsqueeze(2).to(flair.device) + + # scores_at_targets[range(features.shape[0]), lengths.values -1] + # Squeeze crf scores matrices in 1-dim shape and gather scores at targets by matrix indices + scores_at_targets = torch.gather(features.view(batch_size, seq_len, -1), 2, targets_matrix_indices) + scores_at_targets = pack_padded_sequence(scores_at_targets, lengths, batch_first=True)[0] + transitions_to_stop = transitions[ + np.repeat(self.stop_tag, features.shape[0]), + [target[length - 1] for target, length in zip(targets, lengths)], + ] + gold_score = scores_at_targets.sum() + transitions_to_stop.sum() + + scores_upto_t = torch.zeros(batch_size, self.tagset_size, device=flair.device) + + for t in range(max(lengths)): + batch_size_t = sum( + [length > t for length in lengths] + ) # since batch is ordered, we can save computation time by reducing our effective batch_size + + if t == 0: + # Initially, get scores from tag to all other tags + scores_upto_t[:batch_size_t] = ( + scores_upto_t[:batch_size_t] + features[:batch_size_t, t, :, self.start_tag] + ) + else: + # We add scores at current timestep to scores accumulated up to previous timestep, and log-sum-exp + # Remember, the cur_tag of the previous timestep is the prev_tag of this timestep + scores_upto_t[:batch_size_t] = self._log_sum_exp( + features[:batch_size_t, t, :, :] + scores_upto_t[:batch_size_t].unsqueeze(1), dim=2 + ) + + all_paths_scores = self._log_sum_exp(scores_upto_t + transitions[self.stop_tag].unsqueeze(0), dim=1).sum() + + viterbi_loss = all_paths_scores - gold_score + + return viterbi_loss + + @staticmethod + def _log_sum_exp(tensor, dim): + """ + Calculates the log-sum-exponent of a tensor's dimension in a numerically stable way. + :param tensor: tensor + :param dim: dimension to calculate log-sum-exp of + :return: log-sum-exp + """ + m, _ = torch.max(tensor, dim) + m_expanded = m.unsqueeze(dim).expand_as(tensor) + return m + torch.log(torch.sum(torch.exp(tensor - m_expanded), dim)) + + def _format_targets(self, targets: torch.Tensor, lengths: torch.IntTensor): + """ + Formats targets into matrix indices. + CRF scores contain per sentence, per token a (tagset_size x tagset_size) matrix, containing emission score for + token j + transition prob from previous token i. Means, if we think of our rows as "to tag" and our columns + as "from tag", the matrix in cell [10,5] would contain the emission score for tag 10 + transition score + from previous tag 5 and could directly be addressed through the 1-dim indices (10 + tagset_size * 5) = 70, + if our tagset consists of 12 tags. 
+ :param targets: targets as in tag dictionary + :param lengths: lengths of sentences in batch + """ + targets_per_sentence = [] + + targets_list = targets.tolist() + for cut in lengths: + targets_per_sentence.append(targets_list[:cut]) + targets_list = targets_list[cut:] + + for t in targets_per_sentence: + t += [self.tag_dictionary.get_idx_for_item(STOP_TAG)] * (int(lengths.max().item()) - len(t)) + + matrix_indices = list( + map( + lambda s: [self.tag_dictionary.get_idx_for_item(START_TAG) + (s[0] * self.tagset_size)] + + [s[i] + (s[i + 1] * self.tagset_size) for i in range(0, len(s) - 1)], + targets_per_sentence, + ) + ) + + return targets_per_sentence, matrix_indices + + +class ViterbiDecoder: + """ + Decodes a given sequence using the Viterbi algorithm. + """ + + def __init__(self, tag_dictionary: Dictionary): + """ + :param tag_dictionary: Dictionary of tags for sequence labeling task + """ + self.tag_dictionary = tag_dictionary + self.tagset_size = len(tag_dictionary) + self.start_tag = tag_dictionary.get_idx_for_item(START_TAG) + self.stop_tag = tag_dictionary.get_idx_for_item(STOP_TAG) + + def decode( + self, features_tuple: tuple, probabilities_for_all_classes: bool, sentences: List[Sentence] + ) -> Tuple[List, List]: + """ + Decoding function returning the most likely sequence of tags. + :param features_tuple: CRF scores from forward method in shape (batch size, seq len, tagset size, tagset size), + lengths of sentence in batch, transitions of CRF + :param probabilities_for_all_classes: whether to return probabilities for all tags + :return: decoded sequences + """ + features, lengths, transitions = features_tuple + all_tags = [] + + batch_size = features.size(0) + seq_len = features.size(1) + + # Create a tensor to hold accumulated sequence scores at each current tag + scores_upto_t = torch.zeros(batch_size, seq_len + 1, self.tagset_size).to(flair.device) + # Create a tensor to hold back-pointers + # i.e., indices of the previous_tag that corresponds to maximum accumulated score at current tag + # Let pads be the tag index, since that was the last tag in the decoded sequence + backpointers = ( + torch.ones((batch_size, seq_len + 1, self.tagset_size), dtype=torch.long, device=flair.device) + * self.stop_tag + ) + + for t in range(seq_len): + batch_size_t = sum([length > t for length in lengths]) # effective batch size (sans pads) at this timestep + terminates = [i for i, length in enumerate(lengths) if length == t + 1] + + if t == 0: + scores_upto_t[:batch_size_t, t] = features[:batch_size_t, t, :, self.start_tag] + backpointers[:batch_size_t, t, :] = ( + torch.ones((batch_size_t, self.tagset_size), dtype=torch.long) * self.start_tag + ) + else: + # We add scores at current timestep to scores accumulated up to previous timestep, and + # choose the previous timestep that corresponds to the max. 
accumulated score for each current timestep + scores_upto_t[:batch_size_t, t], backpointers[:batch_size_t, t, :] = torch.max( + features[:batch_size_t, t, :, :] + scores_upto_t[:batch_size_t, t - 1].unsqueeze(1), dim=2 + ) + + # If sentence is over, add transition to STOP-tag + if terminates: + scores_upto_t[terminates, t + 1], backpointers[terminates, t + 1, :] = torch.max( + scores_upto_t[terminates, t].unsqueeze(1) + transitions[self.stop_tag].unsqueeze(0), dim=2 + ) + + # Decode/trace best path backwards + decoded = torch.zeros((batch_size, backpointers.size(1)), dtype=torch.long, device=flair.device) + pointer = torch.ones((batch_size, 1), dtype=torch.long, device=flair.device) * self.stop_tag + + for t in list(reversed(range(backpointers.size(1)))): + decoded[:, t] = torch.gather(backpointers[:, t, :], 1, pointer).squeeze(1) + pointer = decoded[:, t].unsqueeze(1) + + # Sanity check + assert torch.equal( + decoded[:, 0], torch.ones((batch_size), dtype=torch.long, device=flair.device) * self.start_tag + ) + + # remove start-tag and backscore to stop-tag + scores_upto_t = scores_upto_t[:, :-1, :] + decoded = decoded[:, 1:] + + # Max + Softmax to get confidence score for predicted label and append label to each token + scores = softmax(scores_upto_t, dim=2) + confidences = torch.max(scores, dim=2) + + tags = [] + for tag_seq, tag_seq_conf, length_seq in zip(decoded, confidences.values, lengths): + tags.append( + [ + (self.tag_dictionary.get_item_for_index(tag), conf.item()) + for tag, conf in list(zip(tag_seq, tag_seq_conf))[:length_seq] + ] + ) + + if probabilities_for_all_classes: + all_tags = self._all_scores_for_token(scores.cpu(), lengths, sentences) + + return tags, all_tags + + def _all_scores_for_token(self, scores: torch.Tensor, lengths: torch.IntTensor, sentences: List[Sentence]): + """ + Returns all scores for each tag in tag dictionary. + :param scores: Scores for current sentence. 
+ """ + scores = scores.numpy() + prob_tags_per_sentence = [] + for scores_sentence, length, sentence in zip(scores, lengths, sentences): + scores_sentence = scores_sentence[:length] + prob_tags_per_sentence.append( + [ + [ + Label(token, self.tag_dictionary.get_item_for_index(score_id), score) + for score_id, score in enumerate(score_dist) + ] + for score_dist, token in zip(scores_sentence, sentence) + ] + ) + return prob_tags_per_sentence \ No newline at end of file diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py index ccf562b..690b9e2 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -9,13 +9,16 @@ import gc from flair.embeddings import TransformerWordEmbeddings -from flair.data import Sentence +from flair.data import Sentence, Dictionary from transformers import BertModel,BertTokenizerFast from lib.allennlp.conditional_random_field import ConditionalRandomField from lib.allennlp.time_distributed import TimeDistributed from random import sample from collections import defaultdict from sklearn.metrics import f1_score, precision_score,recall_score +from lib.crfutils.crf import CRF +from lib.crfutils.viterbi import ViterbiDecoder,ViterbiLoss + from time import time @@ -43,16 +46,19 @@ def forward(self, x): class MTLModel(nn.Module): - def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,batchsize=8,transformernumlayers=6,nhead=8,sequencelength=128): + def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,batchsize=16,transformernumlayers=6,nhead=8,sequencelength=128): super(MTLModel,self).__init__() self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - #self.device = 'cpu' - #self.START_TAG = "" - #self.STOP_TAG = "" self.postagset = {'ADJ':0, 'ADP':1, 'ADV':2, 'AUX':3, 'CCONJ':4, 'DET':5, 'INTJ':6, 'NOUN':7, 'NUM':8, 'PRON':9, 'PROPN':10, 'PUNCT':11, 'SCONJ':12, 'SYM':13, 'VERB':14, 'X':15} # derived from HTB and IAHLTWiki trainsets #TODO: add other UD tags? 
- self.sbd_tag2idx = {'B-SENT': 1,'O': 0,}#self.START_TAG:2,self.STOP_TAG:3} + self.sbd_tag2idx = {'B-SENT': 1,'O': 0,} + + self.sbdtagset = Dictionary() + for key in self.sbd_tag2idx.keys(): + self.sbdtagset.add_item(key.strip()) + self.sbdtagset.add_item("") + self.sbdtagset.add_item("") self.sequence_length = sequencelength self.batch_size = batchsize @@ -98,9 +104,11 @@ def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3 # Intermediate feedforward layer self.ffdim = ffdim if self.encodertype == 'transformer': - self.fflayer = TimeDistributed(nn.Linear(in_features=self.embeddingdim, out_features=self.ffdim)).to(self.device) + #self.fflayer = TimeDistributed(nn.Linear(in_features=self.embeddingdim, out_features=self.ffdim)).to(self.device) + self.fflayer = nn.Linear(in_features=self.embeddingdim, out_features=self.ffdim).to(self.device) else: - self.fflayer = TimeDistributed(nn.Linear(in_features=self.rnndim, out_features=self.ffdim)).to(self.device) + #self.fflayer = TimeDistributed(nn.Linear(in_features=self.rnndim, out_features=self.ffdim)).to(self.device) + self.fflayer = nn.Linear(in_features=self.rnndim, out_features=self.ffdim).to(self.device) # param init for name, param in self.fflayer.named_parameters(): @@ -114,7 +122,7 @@ def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3 #self.hidden2postag = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.postagset.keys()))).to(self.device) # Label space for sent splitter - self.hidden2sbd = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.sbd_tag2idx.keys())).to(self.device)) + self.hidden2sbd = nn.Linear(in_features=self.ffdim,out_features=len(self.sbdtagset)).to(self.device) # param init for name, param in self.hidden2sbd.named_parameters(): @@ -125,7 +133,9 @@ def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3 self.sigmoid = nn.Sigmoid() - #self.sbdcrf = ConditionalRandomField(len(self.sbd_tag2idx),include_start_end_transitions=False).to(self.device) + + self.sbdcrf = CRF(self.sbdtagset,len(self.sbdtagset),init_from_state_dict=False) # TODO: parameterize + self.viterbidecoder = ViterbiDecoder(self.sbdtagset) def forward(self,data): @@ -201,26 +211,20 @@ def forward(self,data): # logits for sbd sbdlogits = self.hidden2sbd(feats) - #sbdlogits = torch.squeeze(sbdlogits) - - sbdlogits = sbdlogits.permute(0,2,1) + sbdlogits = self.sbdcrf(sbdlogits) + #sbdlogits = sbdlogits.permute(1,0,2) - #sbdloss = self.sbdcrf(sbdlogits, sbdtags) - #viterbitags = self.sbdcrf.viterbi_tags(sbdlogits) - - #return None,sbdloss,viterbitags del embeddings del avgembeddings del feats - gc.collect() torch.cuda.empty_cache() return sbdlogits,badrecords class Tagger(): - def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,learningrate = 0.0001): + def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,learningrate = 0.00001): self.mtlmodel = MTLModel(rnndim,rnnnumlayers,rnnbidirectional,rnndropout,encodertype,ffdim) @@ -255,10 +259,12 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnnd #self.postagloss = nn.CrossEntropyLoss() #self.postagloss.to(self.device) - self.sbdloss = nn.CrossEntropyLoss() #(weight=torch.FloatTensor([1,5])) - self.sbdloss.to(self.device) + #self.sbdloss = 
nn.CrossEntropyLoss(weight=torch.FloatTensor([1,5])) + #self.sbdloss.to(self.device) + + self.sbdloss = ViterbiLoss(self.mtlmodel.sbdtagset) - self.optimizer = torch.optim.Adam(list(self.mtlmodel.encoder.parameters()) + list(self.mtlmodel.fflayer.parameters()) + list(self.mtlmodel.hidden2sbd.parameters()), lr=learningrate) + self.optimizer = torch.optim.Adam(list(self.mtlmodel.encoder.parameters()) + list(self.mtlmodel.fflayer.parameters()) + list(self.mtlmodel.sbdcrf.parameters()) + list(self.mtlmodel.hidden2sbd.parameters()), lr=learningrate) #self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer,milestones=[500,2500],gamma=0.01) self.evalstep = 20 @@ -312,8 +318,6 @@ def shingle_predict(self,toks,labels=None,type='sbd'): idx += self.stride_size snum += 1 - - for idx in mapping: best = self.mtlmodel.sequence_length for m in mapping[idx]: @@ -327,23 +331,39 @@ def shingle_predict(self,toks,labels=None,type='sbd'): self.mtlmodel.batch_size = len(spans) + # get the loss sbdlogits,badrecords = self.mtlmodel(spans) - predictions = torch.argmax(sbdlogits,dim=1) + #predictions = torch.argmax(sbdlogits,dim=1) badrecords = sorted(badrecords,reverse=True) for record in badrecords: labelspans.pop(record) + self.mtlmodel.batch_size -= 1 + labelspans = [label for span in labelspans for label in span] labelspans = torch.LongTensor(labelspans).to(self.device) + lengths = [self.mtlmodel.sequence_length] * self.mtlmodel.batch_size + lengths = torch.LongTensor(lengths).to(self.device) + + score = (sbdlogits, lengths, self.mtlmodel.sbdcrf.transitions) + sbdloss = self.sbdloss(score, labelspans) + + # now get the predictions + sents = [] + for span in spans: + sents.append(Sentence(span)) + + predictions, _ = self.mtlmodel.viterbidecoder.decode(score,False,sents) + labels = [] for idx in final_mapping: snum, position = final_mapping[idx] - label = predictions[snum][position].item() + label = self.mtlmodel.sbdtagset.get_idx_for_item(predictions[snum][position][0]) labels.append(label) - loss = self.sbdloss(sbdlogits,labelspans) + #loss = self.sbdloss(sbdlogits,labelspans) del sbdlogits del labelspans @@ -354,7 +374,7 @@ def shingle_predict(self,toks,labels=None,type='sbd'): #print (torch.cuda.memory_stats("cuda:0")) - return labels,loss.item() + return labels,sbdloss.item() def train(self): @@ -391,7 +411,7 @@ def read_file(mode='train'): return dataset - epochs = 3000 + epochs = 10000 trainingdata = read_file() devdata = read_file(mode='dev') @@ -407,17 +427,21 @@ def read_file(mode='train'): sents = [' '.join([s.split('\t')[0].strip() for s in sls]) for sls in data] - sbdtags = [[s.split('\t')[2].strip() for s in sls] for sls in data] - sbdtags = [[self.mtlmodel.sbd_tag2idx[t] for t in tag] for tag in sbdtags] - - sbdlogits,badrecords = self.mtlmodel(sents) - badrecords = sorted(badrecords,reverse=True) + sbdlogits, badrecords = self.mtlmodel(sents) + badrecords = sorted(badrecords, reverse=True) + sbdtags = [[s.split('\t')[2].strip() for s in sls] for sls in data] for record in badrecords: sbdtags.pop(record) + self.mtlmodel.batch_size -= 1 + + sbdtags = torch.tensor([self.mtlmodel.sbdtagset.get_idx_for_item(s) for sbd in sbdtags for s in sbd]) + + lengths = [self.mtlmodel.sequence_length] * self.mtlmodel.batch_size + lengths = torch.LongTensor(lengths).to(self.device) + scores = (sbdlogits,lengths,self.mtlmodel.sbdcrf.transitions) - sbdtags = torch.LongTensor(sbdtags).to(self.device) - sbdloss = self.sbdloss(sbdlogits,sbdtags) + sbdloss = self.sbdloss(scores,sbdtags) #mtlloss = posloss + 
sbdloss # uniform weighting. # TODO: learnable weights? #mtlloss.backward() @@ -456,7 +480,7 @@ def read_file(mode='train'): sents = [s.split('\t')[0].strip() for s in slice] goldlabels = [s.split('\t')[2].strip() for s in slice] - goldlabels = [self.mtlmodel.sbd_tag2idx[s] for s in goldlabels] + goldlabels = [self.mtlmodel.sbdtagset.get_idx_for_item(s) for s in goldlabels] preds,devloss = self.shingle_predict(sents,goldlabels) totaldevloss += devloss @@ -478,7 +502,7 @@ def read_file(mode='train'): precision = precision_score(allgold,allpreds) recall = recall_score(allgold,allpreds) - self.writer.add_scalar("dev_loss",round(totaldevloss,2),int(epoch / self.evalstep)) + self.writer.add_scalar("dev_loss",round(totaldevloss/len(devdata),2),int(epoch / self.evalstep)) self.writer.add_scalar("dev_f1", round(f1,2), int(epoch / self.evalstep)) self.writer.add_scalar("dev_precision", round(precision, 2), int(epoch / self.evalstep)) self.writer.add_scalar("dev_recall", round(recall, 2), int(epoch / self.evalstep)) @@ -487,8 +511,8 @@ def read_file(mode='train'): print ('dev f1:' + str(f1)) print('dev precision:' + str(precision)) print('dev recall:' + str(recall)) + print ('\n') - torch.cuda.empty_cache() self.mtlmodel.train() #self.mtlmodel.batch_size = old_batch_size From d0c50cc3e50ee84b663554ed828b3800f1c2b490 Mon Sep 17 00:00:00 2001 From: nitin Date: Tue, 19 Jul 2022 03:26:08 +0800 Subject: [PATCH 07/32] sentence splitting f1 0.93' --- hebpipe/lib/allennlp/__init__.py | 0 .../lib/allennlp/conditional_random_field.py | 639 ------------------ hebpipe/lib/allennlp/time_distributed.py | 79 --- .../lib/multitask_sentsplitter_postagger.py | 94 +-- 4 files changed, 26 insertions(+), 786 deletions(-) delete mode 100644 hebpipe/lib/allennlp/__init__.py delete mode 100644 hebpipe/lib/allennlp/conditional_random_field.py delete mode 100644 hebpipe/lib/allennlp/time_distributed.py diff --git a/hebpipe/lib/allennlp/__init__.py b/hebpipe/lib/allennlp/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/hebpipe/lib/allennlp/conditional_random_field.py b/hebpipe/lib/allennlp/conditional_random_field.py deleted file mode 100644 index 2982358..0000000 --- a/hebpipe/lib/allennlp/conditional_random_field.py +++ /dev/null @@ -1,639 +0,0 @@ -""" -Conditional random field -""" -import logging -import math -import torch - -from typing import List, Tuple, Dict, Union,Optional,Any, TypeVar - -logger = logging.getLogger(__name__) - -T = TypeVar("T") -StateDictType = Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"] - -VITERBI_DECODING = Tuple[List[int], float] # a list of tags, and a viterbi score - - -class ConfigurationError(Exception): - - """ - The exception raised by any AllenNLP object when it's misconfigured - (e.g. missing properties, invalid properties, unknown properties). - """ - - def __reduce__(self) -> Union[str, Tuple[Any, ...]]: - return type(self), (self.message,) - - def __init__(self, message: str): - super().__init__() - self.message = message - - def __str__(self): - return self.message - -def logsumexp(tensor: torch.Tensor, dim: int = -1, keepdim: bool = False) -> torch.Tensor: - """ - A numerically stable computation of logsumexp. This is mathematically equivalent to - `tensor.exp().sum(dim, keep=keepdim).log()`. This function is typically used for summing log - probabilities. - - # Parameters - - tensor : `torch.FloatTensor`, required. - A tensor of arbitrary size. 
- dim : `int`, optional (default = `-1`) - The dimension of the tensor to apply the logsumexp to. - keepdim: `bool`, optional (default = `False`) - Whether to retain a dimension of size one at the dimension we reduce over. - """ - max_score, _ = tensor.max(dim, keepdim=keepdim) - if keepdim: - stable_vec = tensor - max_score - else: - stable_vec = tensor - max_score.unsqueeze(dim) - return max_score + (stable_vec.exp().sum(dim, keepdim=keepdim)).log() - - -def viterbi_decode( - tag_sequence: torch.Tensor, - transition_matrix: torch.Tensor, - tag_observations: Optional[List[int]] = None, - allowed_start_transitions: torch.Tensor = None, - allowed_end_transitions: torch.Tensor = None, - top_k: int = None, -): - """ - Perform Viterbi decoding in log space over a sequence given a transition matrix - specifying pairwise (transition) potentials between tags and a matrix of shape - (sequence_length, num_tags) specifying unary potentials for possible tags per - timestep. - - # Parameters - - tag_sequence : `torch.Tensor`, required. - A tensor of shape (sequence_length, num_tags) representing scores for - a set of tags over a given sequence. - transition_matrix : `torch.Tensor`, required. - A tensor of shape (num_tags, num_tags) representing the binary potentials - for transitioning between a given pair of tags. - tag_observations : `Optional[List[int]]`, optional, (default = `None`) - A list of length `sequence_length` containing the class ids of observed - elements in the sequence, with unobserved elements being set to -1. Note that - it is possible to provide evidence which results in degenerate labelings if - the sequences of tags you provide as evidence cannot transition between each - other, or those transitions are extremely unlikely. In this situation we log a - warning, but the responsibility for providing self-consistent evidence ultimately - lies with the user. - allowed_start_transitions : `torch.Tensor`, optional, (default = `None`) - An optional tensor of shape (num_tags,) describing which tags the START token - may transition *to*. If provided, additional transition constraints will be used for - determining the start element of the sequence. - allowed_end_transitions : `torch.Tensor`, optional, (default = `None`) - An optional tensor of shape (num_tags,) describing which tags may transition *to* the - end tag. If provided, additional transition constraints will be used for determining - the end element of the sequence. - top_k : `int`, optional, (default = `None`) - Optional integer specifying how many of the top paths to return. For top_k>=1, returns - a tuple of two lists: top_k_paths, top_k_scores, For top_k==None, returns a flattened - tuple with just the top path and its score (not in lists, for backwards compatibility). - - # Returns - - viterbi_path : `List[int]` - The tag indices of the maximum likelihood tag sequence. - viterbi_score : `torch.Tensor` - The score of the viterbi path. - """ - if top_k is None: - top_k = 1 - flatten_output = True - elif top_k >= 1: - flatten_output = False - else: - raise ValueError(f"top_k must be either None or an integer >=1. 
Instead received {top_k}") - - sequence_length, num_tags = list(tag_sequence.size()) - - has_start_end_restrictions = ( - allowed_end_transitions is not None or allowed_start_transitions is not None - ) - - if has_start_end_restrictions: - - if allowed_end_transitions is None: - allowed_end_transitions = torch.zeros(num_tags) - if allowed_start_transitions is None: - allowed_start_transitions = torch.zeros(num_tags) - - num_tags = num_tags + 2 - new_transition_matrix = torch.zeros(num_tags, num_tags) - new_transition_matrix[:-2, :-2] = transition_matrix - - # Start and end transitions are fully defined, but cannot transition between each other. - - allowed_start_transitions = torch.cat( - [allowed_start_transitions, torch.tensor([-math.inf, -math.inf])] - ) - allowed_end_transitions = torch.cat( - [allowed_end_transitions, torch.tensor([-math.inf, -math.inf])] - ) - - # First define how we may transition FROM the start and end tags. - new_transition_matrix[-2, :] = allowed_start_transitions - # We cannot transition from the end tag to any tag. - new_transition_matrix[-1, :] = -math.inf - - new_transition_matrix[:, -1] = allowed_end_transitions - # We cannot transition to the start tag from any tag. - new_transition_matrix[:, -2] = -math.inf - - transition_matrix = new_transition_matrix - - if tag_observations: - if len(tag_observations) != sequence_length: - raise ConfigurationError( - "Observations were provided, but they were not the same length " - "as the sequence. Found sequence of length: {} and evidence: {}".format( - sequence_length, tag_observations - ) - ) - else: - tag_observations = [-1 for _ in range(sequence_length)] - - if has_start_end_restrictions: - tag_observations = [num_tags - 2] + tag_observations + [num_tags - 1] - zero_sentinel = torch.zeros(1, num_tags) - extra_tags_sentinel = torch.ones(sequence_length, 2) * -math.inf - tag_sequence = torch.cat([tag_sequence, extra_tags_sentinel], -1) - tag_sequence = torch.cat([zero_sentinel, tag_sequence, zero_sentinel], 0) - sequence_length = tag_sequence.size(0) - - path_scores = [] - path_indices = [] - - if tag_observations[0] != -1: - one_hot = torch.zeros(num_tags) - one_hot[tag_observations[0]] = 100000.0 - path_scores.append(one_hot.unsqueeze(0)) - else: - path_scores.append(tag_sequence[0, :].unsqueeze(0)) - - # Evaluate the scores for all possible paths. - for timestep in range(1, sequence_length): - # Add pairwise potentials to current scores. - summed_potentials = path_scores[timestep - 1].unsqueeze(2) + transition_matrix - summed_potentials = summed_potentials.view(-1, num_tags) - - # Best pairwise potential path score from the previous timestep. - max_k = min(summed_potentials.size()[0], top_k) - scores, paths = torch.topk(summed_potentials, k=max_k, dim=0) - - # If we have an observation for this timestep, use it - # instead of the distribution over tags. - observation = tag_observations[timestep] - # Warn the user if they have passed - # invalid/extremely unlikely evidence. - if tag_observations[timestep - 1] != -1 and observation != -1: - if transition_matrix[tag_observations[timestep - 1], observation] < -10000: - logger.warning( - "The pairwise potential between tags you have passed as " - "observations is extremely unlikely. Double check your evidence " - "or transition potentials!" 
- ) - if observation != -1: - one_hot = torch.zeros(num_tags) - one_hot[observation] = 100000.0 - path_scores.append(one_hot.unsqueeze(0)) - else: - path_scores.append(tag_sequence[timestep, :] + scores) - path_indices.append(paths.squeeze()) - - # Construct the most likely sequence backwards. - path_scores_v = path_scores[-1].view(-1) - max_k = min(path_scores_v.size()[0], top_k) - viterbi_scores, best_paths = torch.topk(path_scores_v, k=max_k, dim=0) - viterbi_paths = [] - for i in range(max_k): - viterbi_path = [best_paths[i]] - for backward_timestep in reversed(path_indices): - viterbi_path.append(int(backward_timestep.view(-1)[viterbi_path[-1]])) - # Reverse the backward path. - viterbi_path.reverse() - - if has_start_end_restrictions: - viterbi_path = viterbi_path[1:-1] - - # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo. - viterbi_path = [j % num_tags for j in viterbi_path] - viterbi_paths.append(viterbi_path) - - if flatten_output: - return viterbi_paths[0], viterbi_scores[0] - - return viterbi_paths, viterbi_scores - - -def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tuple[int, int]]: - """ - Given labels and a constraint type, returns the allowed transitions. It will - additionally include transitions for the start and end states, which are used - by the conditional random field. - # Parameters - constraint_type : `str`, required - Indicates which constraint to apply. Current choices are - "BIO", "IOB1", "BIOUL", and "BMES". - labels : `Dict[int, str]`, required - A mapping {label_id -> label}. Most commonly this would be the value from - Vocabulary.get_index_to_token_vocabulary() - # Returns - `List[Tuple[int, int]]` - The allowed transitions (from_label_id, to_label_id). - """ - num_labels = len(labels) - start_tag = num_labels - end_tag = num_labels + 1 - labels_with_boundaries = list(labels.items()) + [(start_tag, "START"), (end_tag, "END")] - - allowed = [] - for from_label_index, from_label in labels_with_boundaries: - if from_label in ("START", "END"): - from_tag = from_label - from_entity = "" - else: - from_tag = from_label[0] - from_entity = from_label[1:] - for to_label_index, to_label in labels_with_boundaries: - if to_label in ("START", "END"): - to_tag = to_label - to_entity = "" - else: - to_tag = to_label[0] - to_entity = to_label[1:] - if is_transition_allowed(constraint_type, from_tag, from_entity, to_tag, to_entity): - allowed.append((from_label_index, to_label_index)) - return allowed - - - -def is_transition_allowed( - constraint_type: str, from_tag: str, from_entity: str, to_tag: str, to_entity: str -): - """ - Given a constraint type and strings `from_tag` and `to_tag` that - represent the origin and destination of the transition, return whether - the transition is allowed under the given constraint type. - # Parameters - constraint_type : `str`, required - Indicates which constraint to apply. Current choices are - "BIO", "IOB1", "BIOUL", and "BMES". - from_tag : `str`, required - The tag that the transition originates from. For example, if the - label is `I-PER`, the `from_tag` is `I`. - from_entity : `str`, required - The entity corresponding to the `from_tag`. For example, if the - label is `I-PER`, the `from_entity` is `PER`. - to_tag : `str`, required - The tag that the transition leads to. For example, if the - label is `I-PER`, the `to_tag` is `I`. - to_entity : `str`, required - The entity corresponding to the `to_tag`. 
For example, if the - label is `I-PER`, the `to_entity` is `PER`. - # Returns - `bool` - Whether the transition is allowed under the given `constraint_type`. - """ - - if to_tag == "START" or from_tag == "END": - # Cannot transition into START or from END - return False - - if constraint_type == "BIOUL": - if from_tag == "START": - return to_tag in ("O", "B", "U") - if to_tag == "END": - return from_tag in ("O", "L", "U") - return any( - [ - # O can transition to O, B-* or U-* - # L-x can transition to O, B-*, or U-* - # U-x can transition to O, B-*, or U-* - from_tag in ("O", "L", "U") and to_tag in ("O", "B", "U"), - # B-x can only transition to I-x or L-x - # I-x can only transition to I-x or L-x - from_tag in ("B", "I") and to_tag in ("I", "L") and from_entity == to_entity, - ] - ) - elif constraint_type == "BIO": - if from_tag == "START": - return to_tag in ("O", "B") - if to_tag == "END": - return from_tag in ("O", "B", "I") - return any( - [ - # Can always transition to O or B-x - to_tag in ("O", "B"), - # Can only transition to I-x from B-x or I-x - to_tag == "I" and from_tag in ("B", "I") and from_entity == to_entity, - ] - ) - elif constraint_type == "IOB1": - if from_tag == "START": - return to_tag in ("O", "I") - if to_tag == "END": - return from_tag in ("O", "B", "I") - return any( - [ - # Can always transition to O or I-x - to_tag in ("O", "I"), - # Can only transition to B-x from B-x or I-x, where - # x is the same tag. - to_tag == "B" and from_tag in ("B", "I") and from_entity == to_entity, - ] - ) - elif constraint_type == "BMES": - if from_tag == "START": - return to_tag in ("B", "S") - if to_tag == "END": - return from_tag in ("E", "S") - return any( - [ - # Can only transition to B or S from E or S. - to_tag in ("B", "S") and from_tag in ("E", "S"), - # Can only transition to M-x from B-x, where - # x is the same tag. - to_tag == "M" and from_tag in ("B", "M") and from_entity == to_entity, - # Can only transition to E-x from B-x or M-x, where - # x is the same tag. - to_tag == "E" and from_tag in ("B", "M") and from_entity == to_entity, - ] - ) - else: - raise ConfigurationError(f"Unknown constraint type: {constraint_type}") - - -class ConditionalRandomField(torch.nn.Module): - """ - This module uses the "forward-backward" algorithm to compute - the log-likelihood of its inputs assuming a conditional random field model. - See, e.g. http://www.cs.columbia.edu/~mcollins/fb.pdf - # Parameters - num_tags : `int`, required - The number of tags. - constraints : `List[Tuple[int, int]]`, optional (default = `None`) - An optional list of allowed transitions (from_tag_id, to_tag_id). - These are applied to `viterbi_tags()` but do not affect `forward()`. - These should be derived from `allowed_transitions` so that the - start and end transitions are handled correctly for your tag type. - include_start_end_transitions : `bool`, optional (default = `True`) - Whether to include the start and end transition parameters. - """ - - def __init__( - self, - num_tags: int, - constraints: List[Tuple[int, int]] = None, - include_start_end_transitions: bool = True, - ) -> None: - super().__init__() - self.num_tags = num_tags - - # transitions[i, j] is the logit for transitioning from state i to state j. - self.transitions = torch.nn.Parameter(torch.empty(num_tags, num_tags)) - - # _constraint_mask indicates valid transitions (based on supplied constraints). 
- # Include special start of sequence (num_tags + 1) and end of sequence tags (num_tags + 2) - if constraints is None: - # All transitions are valid. - constraint_mask = torch.full((num_tags + 2, num_tags + 2), 1.0) - else: - constraint_mask = torch.full((num_tags + 2, num_tags + 2), 0.0) - for i, j in constraints: - constraint_mask[i, j] = 1.0 - - self._constraint_mask = torch.nn.Parameter(constraint_mask, requires_grad=False) - - # Also need logits for transitioning from "start" state and to "end" state. - self.include_start_end_transitions = include_start_end_transitions - if include_start_end_transitions: - self.start_transitions = torch.nn.Parameter(torch.Tensor(num_tags)) - self.end_transitions = torch.nn.Parameter(torch.Tensor(num_tags)) - - self.reset_parameters() - - def reset_parameters(self): - torch.nn.init.xavier_normal_(self.transitions) - if self.include_start_end_transitions: - torch.nn.init.normal_(self.start_transitions) - torch.nn.init.normal_(self.end_transitions) - - def _input_likelihood(self, logits: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: - """ - Computes the (batch_size,) denominator term for the log-likelihood, which is the - sum of the likelihoods across all possible state sequences. - """ - batch_size, sequence_length, num_tags = logits.size() - - # Transpose batch size and sequence dimensions - mask = mask.transpose(0, 1).contiguous() - logits = logits.transpose(0, 1).contiguous() - - # Initial alpha is the (batch_size, num_tags) tensor of likelihoods combining the - # transitions to the initial states and the logits for the first timestep. - if self.include_start_end_transitions: - alpha = self.start_transitions.view(1, num_tags) + logits[0] - else: - alpha = logits[0] - - # For each i we compute logits for the transitions from timestep i-1 to timestep i. - # We do so in a (batch_size, num_tags, num_tags) tensor where the axes are - # (instance, current_tag, next_tag) - for i in range(1, sequence_length): - # The emit scores are for time i ("next_tag") so we broadcast along the current_tag axis. - emit_scores = logits[i].view(batch_size, 1, num_tags) - # Transition scores are (current_tag, next_tag) so we broadcast along the instance axis. - transition_scores = self.transitions.view(1, num_tags, num_tags) - # Alpha is for the current_tag, so we broadcast along the next_tag axis. - broadcast_alpha = alpha.view(batch_size, num_tags, 1) - - # Add all the scores together and logexp over the current_tag axis. - inner = broadcast_alpha + emit_scores + transition_scores - - # In valid positions (mask == True) we want to take the logsumexp over the current_tag dimension - # of `inner`. Otherwise (mask == False) we want to retain the previous alpha. - alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * ( - ~mask[i] - ).view(batch_size, 1) - - # Every sequence needs to end with a transition to the stop_tag. 
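The alpha recursion above is the standard forward algorithm in log space. As a minimal standalone sketch (illustrative only, not part of this patch), the same masked update can be written with torch.logsumexp; the shapes and the log_partition name are assumptions made for the example:

import torch

def log_partition(logits, transitions, mask):
    # logits: (seq_len, batch, num_tags) emission scores
    # transitions: (num_tags, num_tags), transitions[i, j] = score of moving from tag i to tag j
    # mask: (seq_len, batch) bool, True at real (unpadded) positions
    alpha = logits[0]                                              # (batch, num_tags)
    for i in range(1, logits.size(0)):
        inner = alpha.unsqueeze(2) + transitions.unsqueeze(0) + logits[i].unsqueeze(1)
        new_alpha = torch.logsumexp(inner, dim=1)                  # sum out the current tag
        alpha = torch.where(mask[i].unsqueeze(1), new_alpha, alpha)  # keep old alpha at padded steps
    return torch.logsumexp(alpha, dim=1)                           # (batch,) log partition function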
- if self.include_start_end_transitions: - stops = alpha + self.end_transitions.view(1, num_tags) - else: - stops = alpha - - # Finally we log_sum_exp along the num_tags dim, result is (batch_size,) - return logsumexp(stops) - - def _joint_likelihood( - self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor - ) -> torch.Tensor: - """ - Computes the numerator term for the log-likelihood, which is just score(inputs, tags) - """ - batch_size, sequence_length, _ = logits.data.shape - - # Transpose batch size and sequence dimensions: - logits = logits.transpose(0, 1).contiguous() - mask = mask.transpose(0, 1).contiguous() - tags = tags.transpose(0, 1).contiguous() - - # Start with the transition scores from start_tag to the first tag in each input - if self.include_start_end_transitions: - score = self.start_transitions.index_select(0, tags[0]) - else: - score = 0.0 - - # Add up the scores for the observed transitions and all the inputs but the last - for i in range(sequence_length - 1): - # Each is shape (batch_size,) - current_tag, next_tag = tags[i], tags[i + 1] - - # The scores for transitioning from current_tag to next_tag - transition_score = self.transitions[current_tag.view(-1), next_tag.view(-1)] - - # The score for using current_tag - emit_score = logits[i].gather(1, current_tag.view(batch_size, 1)).squeeze(1) - - # Include transition score if next element is unmasked, - # input_score if this element is unmasked. - score = score + transition_score * mask[i + 1] + emit_score * mask[i] - - # Transition from last state to "stop" state. To start with, we need to find the last tag - # for each instance. - last_tag_index = mask.sum(0).long() - 1 - last_tags = tags.gather(0, last_tag_index.view(1, batch_size)).squeeze(0) - - # Compute score of transitioning to `stop_tag` from each "last tag". - if self.include_start_end_transitions: - last_transition_score = self.end_transitions.index_select(0, last_tags) - else: - last_transition_score = 0.0 - - # Add the last input if it's not masked. - last_inputs = logits[-1] # (batch_size, num_tags) - last_input_score = last_inputs.gather(1, last_tags.view(-1, 1)) # (batch_size, 1) - last_input_score = last_input_score.squeeze() # (batch_size,) - - score = score + last_transition_score + last_input_score * mask[-1] - - return score - - def forward( - self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None - ) -> torch.Tensor: - """ - Computes the log likelihood. - """ - - if mask is None: - mask = torch.ones(*tags.size(), dtype=torch.bool, device=inputs.device) - else: - # The code below fails in weird ways if this isn't a bool tensor, so we make sure. - mask = mask.to(torch.bool) - - log_denominator = self._input_likelihood(inputs, mask) - log_numerator = self._joint_likelihood(inputs, tags, mask) - - return torch.sum(log_numerator - log_denominator) - - def viterbi_tags( - self, logits: torch.Tensor, mask: torch.BoolTensor = None, top_k: int = None - ) -> Union[List[VITERBI_DECODING], List[List[VITERBI_DECODING]]]: - """ - Uses viterbi algorithm to find most likely tags for the given inputs. - If constraints are applied, disallows all other transitions. 
- Returns a list of results, of the same size as the batch (one result per batch member) - Each result is a List of length top_k, containing the top K viterbi decodings - Each decoding is a tuple (tag_sequence, viterbi_score) - For backwards compatibility, if top_k is None, then instead returns a flat list of - tag sequences (the top tag sequence for each batch item). - """ - if mask is None: - mask = torch.ones(*logits.shape[:2], dtype=torch.bool, device=logits.device) - - if top_k is None: - top_k = 1 - flatten_output = True - else: - flatten_output = False - - _, max_seq_length, num_tags = logits.size() - - # Get the tensors out of the variables - logits, mask = logits.data, mask.data - - # Augment transitions matrix with start and end transitions - start_tag = num_tags - end_tag = num_tags + 1 - transitions = torch.full((num_tags + 2, num_tags + 2), -10000.0, device=logits.device) - - # Apply transition constraints - constrained_transitions = self.transitions * self._constraint_mask[ - :num_tags, :num_tags - ] + -10000.0 * (1 - self._constraint_mask[:num_tags, :num_tags]) - transitions[:num_tags, :num_tags] = constrained_transitions.data - - if self.include_start_end_transitions: - transitions[ - start_tag, :num_tags - ] = self.start_transitions.detach() * self._constraint_mask[ - start_tag, :num_tags - ].data + -10000.0 * ( - 1 - self._constraint_mask[start_tag, :num_tags].detach() - ) - transitions[:num_tags, end_tag] = self.end_transitions.detach() * self._constraint_mask[ - :num_tags, end_tag - ].data + -10000.0 * (1 - self._constraint_mask[:num_tags, end_tag].detach()) - else: - transitions[start_tag, :num_tags] = -10000.0 * ( - 1 - self._constraint_mask[start_tag, :num_tags].detach() - ) - transitions[:num_tags, end_tag] = -10000.0 * ( - 1 - self._constraint_mask[:num_tags, end_tag].detach() - ) - - best_paths = [] - # Pad the max sequence length by 2 to account for start_tag + end_tag. - tag_sequence = torch.empty(max_seq_length + 2, num_tags + 2, device=logits.device) - - for prediction, prediction_mask in zip(logits, mask): - mask_indices = prediction_mask.nonzero(as_tuple=False).squeeze() - masked_prediction = torch.index_select(prediction, 0, mask_indices) - sequence_length = masked_prediction.shape[0] - - # Start with everything totally unlikely - tag_sequence.fill_(-10000.0) - # At timestep 0 we must have the START_TAG - tag_sequence[0, start_tag] = 0.0 - # At steps 1, ..., sequence_length we just use the incoming prediction - tag_sequence[1 : (sequence_length + 1), :num_tags] = masked_prediction - # And at the last timestep we must have the END_TAG - tag_sequence[sequence_length + 1, end_tag] = 0.0 - - # We pass the tags and the transitions to `viterbi_decode`. - viterbi_paths, viterbi_scores = viterbi_decode( - tag_sequence=tag_sequence[: (sequence_length + 2)], - transition_matrix=transitions, - top_k=top_k, - ) - top_k_paths = [] - for viterbi_path, viterbi_score in zip(viterbi_paths, viterbi_scores): - # Get rid of START and END sentinels and append. 
- viterbi_path = viterbi_path[1:-1] - top_k_paths.append((viterbi_path, viterbi_score.item())) - best_paths.append(top_k_paths) - - if flatten_output: - return [top_k_paths[0] for top_k_paths in best_paths] - - return best_paths \ No newline at end of file diff --git a/hebpipe/lib/allennlp/time_distributed.py b/hebpipe/lib/allennlp/time_distributed.py deleted file mode 100644 index 7a4d86a..0000000 --- a/hebpipe/lib/allennlp/time_distributed.py +++ /dev/null @@ -1,79 +0,0 @@ -""" -A wrapper that unrolls the second (time) dimension of a tensor -into the first (batch) dimension, applies some other `Module`, -and then rolls the time dimension back up. -""" - -from typing import List - - -import torch - - -class TimeDistributed(torch.nn.Module): - """ - Given an input shaped like `(batch_size, time_steps, [rest])` and a `Module` that takes - inputs like `(batch_size, [rest])`, `TimeDistributed` reshapes the input to be - `(batch_size * time_steps, [rest])`, applies the contained `Module`, then reshapes it back. - Note that while the above gives shapes with `batch_size` first, this `Module` also works if - `batch_size` is second - we always just combine the first two dimensions, then split them. - It also reshapes keyword arguments unless they are not tensors or their name is specified in - the optional `pass_through` iterable. - """ - - def __init__(self, module): - super().__init__() - self._module = module - - def forward(self, *inputs, pass_through: List[str] = None, **kwargs): - - pass_through = pass_through or [] - - reshaped_inputs = [self._reshape_tensor(input_tensor) for input_tensor in inputs] - - # Need some input to then get the batch_size and time_steps. - some_input = None - if inputs: - some_input = inputs[-1] - - reshaped_kwargs = {} - for key, value in kwargs.items(): - if isinstance(value, torch.Tensor) and key not in pass_through: - if some_input is None: - some_input = value - - value = self._reshape_tensor(value) - - reshaped_kwargs[key] = value - - reshaped_outputs = self._module(*reshaped_inputs, **reshaped_kwargs) - - if some_input is None: - raise RuntimeError("No input tensor to time-distribute") - - # Now get the output back into the right shape. - # (batch_size, time_steps, **output_size) - tuple_output = True - if not isinstance(reshaped_outputs, tuple): - tuple_output = False - reshaped_outputs = (reshaped_outputs,) - - outputs = [] - for reshaped_output in reshaped_outputs: - new_size = some_input.size()[:2] + reshaped_output.size()[1:] - outputs.append(reshaped_output.contiguous().view(new_size)) - - if not tuple_output: - outputs = outputs[0] - - return outputs - - @staticmethod - def _reshape_tensor(input_tensor): - input_size = input_tensor.size() - if len(input_size) <= 2: - raise RuntimeError(f"No dimension to distribute: {input_size}") - # Squash batch_size and time_steps into a single axis; result has shape - # (batch_size * time_steps, **input_size). 
- squashed_shape = [-1] + list(input_size[2:]) - return input_tensor.contiguous().view(*squashed_shape) diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py index 690b9e2..d2743b3 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -3,27 +3,21 @@ import torch.nn as nn import os import shutil -import flair import random import math -import gc -from flair.embeddings import TransformerWordEmbeddings from flair.data import Sentence, Dictionary from transformers import BertModel,BertTokenizerFast -from lib.allennlp.conditional_random_field import ConditionalRandomField -from lib.allennlp.time_distributed import TimeDistributed from random import sample from collections import defaultdict from sklearn.metrics import f1_score, precision_score,recall_score from lib.crfutils.crf import CRF from lib.crfutils.viterbi import ViterbiDecoder,ViterbiLoss - from time import time os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256,garbage_collection_threshold:0.2" - +SAMPLE_SIZE = 16 class PositionalEncoding(nn.Module): @@ -46,13 +40,13 @@ def forward(self, x): class MTLModel(nn.Module): - def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,batchsize=16,transformernumlayers=6,nhead=8,sequencelength=128): + def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,batchsize=SAMPLE_SIZE,transformernumlayers=6,nhead=8,sequencelength=128): super(MTLModel,self).__init__() self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") self.postagset = {'ADJ':0, 'ADP':1, 'ADV':2, 'AUX':3, 'CCONJ':4, 'DET':5, 'INTJ':6, 'NOUN':7, 'NUM':8, 'PRON':9, 'PROPN':10, 'PUNCT':11, 'SCONJ':12, 'SYM':13, 'VERB':14, 'X':15} # derived from HTB and IAHLTWiki trainsets #TODO: add other UD tags? - self.sbd_tag2idx = {'B-SENT': 1,'O': 0,} + self.sbd_tag2idx = {'B-SENT': 1,'O': 0} self.sbdtagset = Dictionary() for key in self.sbd_tag2idx.keys(): @@ -104,10 +98,8 @@ def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3 # Intermediate feedforward layer self.ffdim = ffdim if self.encodertype == 'transformer': - #self.fflayer = TimeDistributed(nn.Linear(in_features=self.embeddingdim, out_features=self.ffdim)).to(self.device) self.fflayer = nn.Linear(in_features=self.embeddingdim, out_features=self.ffdim).to(self.device) else: - #self.fflayer = TimeDistributed(nn.Linear(in_features=self.rnndim, out_features=self.ffdim)).to(self.device) self.fflayer = nn.Linear(in_features=self.rnndim, out_features=self.ffdim).to(self.device) # param init @@ -119,7 +111,7 @@ def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3 # Label space for the pos tagger # TODO: CRF? 
- #self.hidden2postag = TimeDistributed(nn.Linear(in_features=self.ffdim,out_features=len(self.postagset.keys()))).to(self.device) + #self.hidden2postag = nn.Linear(in_features=self.ffdim,out_features=len(self.postagset.keys())).to(self.device) # Label space for sent splitter self.hidden2sbd = nn.Linear(in_features=self.ffdim,out_features=len(self.sbdtagset)).to(self.device) @@ -133,16 +125,12 @@ def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3 self.sigmoid = nn.Sigmoid() - self.sbdcrf = CRF(self.sbdtagset,len(self.sbdtagset),init_from_state_dict=False) # TODO: parameterize self.viterbidecoder = ViterbiDecoder(self.sbdtagset) def forward(self,data): - """ - slice is a list of tuples of length = seq_len. Each tuple is (token, pos tag, sentence boundary label) - """ badrecords = [] data = [d.split() for d in data] # for AlephBERT tokens = self.tokenizer(data,return_tensors='pt',padding=True,is_split_into_words=True).to(self.device) # tell AlephBERT that there is some tokenization already. Otherwise its own subword tokenization messes things up. @@ -159,6 +147,7 @@ def forward(self,data): for k in range(0,len(tokens.encodings)): emb = [] maxindex = max([w for w in tokens.encodings[k].words if w]) + try: assert maxindex == self.sequence_length - 1 # otherwise won't average correctly and align with labels except AssertionError: @@ -189,7 +178,10 @@ def forward(self,data): emb = torch.stack(emb) avgembeddings.append(emb) - avgembeddings = torch.stack(avgembeddings) + if len(avgembeddings) > 0: + avgembeddings = torch.stack(avgembeddings) + else: + return None,badrecords #print ('average embeddings') #print (time() - start) @@ -212,7 +204,6 @@ def forward(self,data): # logits for sbd sbdlogits = self.hidden2sbd(feats) sbdlogits = self.sbdcrf(sbdlogits) - #sbdlogits = sbdlogits.permute(1,0,2) del embeddings @@ -224,7 +215,7 @@ def forward(self,data): return sbdlogits,badrecords class Tagger(): - def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,learningrate = 0.00001): + def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,learningrate = 0.0001): self.mtlmodel = MTLModel(rnndim,rnnnumlayers,rnnbidirectional,rnndropout,encodertype,ffdim) @@ -246,7 +237,6 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnnd self.testdatafile = '../data/sentsplit_postag_test_gold.tab' self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - #self.device = 'cpu' self.trainflag = trainflag self.trainfile = trainfile @@ -259,9 +249,6 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnnd #self.postagloss = nn.CrossEntropyLoss() #self.postagloss.to(self.device) - #self.sbdloss = nn.CrossEntropyLoss(weight=torch.FloatTensor([1,5])) - #self.sbdloss.to(self.device) - self.sbdloss = ViterbiLoss(self.mtlmodel.sbdtagset) self.optimizer = torch.optim.Adam(list(self.mtlmodel.encoder.parameters()) + list(self.mtlmodel.fflayer.parameters()) + list(self.mtlmodel.sbdcrf.parameters()) + list(self.mtlmodel.hidden2sbd.parameters()), lr=learningrate) @@ -333,13 +320,16 @@ def shingle_predict(self,toks,labels=None,type='sbd'): # get the loss sbdlogits,badrecords = self.mtlmodel(spans) - #predictions = torch.argmax(sbdlogits,dim=1) badrecords = sorted(badrecords,reverse=True) for record in 
badrecords: labelspans.pop(record) + spans.pop(record) self.mtlmodel.batch_size -= 1 + if len(spans) == 0: + return None, None + labelspans = [label for span in labelspans for label in span] labelspans = torch.LongTensor(labelspans).to(self.device) @@ -363,19 +353,14 @@ def shingle_predict(self,toks,labels=None,type='sbd'): labels.append(label) - #loss = self.sbdloss(sbdlogits,labelspans) - del sbdlogits del labelspans - #gc.collect() torch.cuda.empty_cache() - #print (torch.cuda.memory_stats("cuda:0")) - - return labels,sbdloss.item() + def train(self): def read_file(mode='train'): @@ -386,7 +371,6 @@ def read_file(mode='train'): lines = fi.readlines() lines = list(reversed(lines)) # hebrew is right to left... - # shingle it here to get more training data for idx in range(0, len(lines), 128): if idx + self.mtlmodel.sequence_length >= len(lines): slice = lines[idx:len(lines)] @@ -411,7 +395,7 @@ def read_file(mode='train'): return dataset - epochs = 10000 + epochs = 1000 trainingdata = read_file() devdata = read_file(mode='dev') @@ -421,7 +405,7 @@ def read_file(mode='train'): self.mtlmodel.train() self.optimizer.zero_grad() - data = sample(trainingdata,self.mtlmodel.batch_size) + data = sample(trainingdata,SAMPLE_SIZE) data = [datum for datum in data if len(datum) == self.mtlmodel.sequence_length] self.mtlmodel.batch_size = len(data) @@ -459,20 +443,6 @@ def read_file(mode='train'): with torch.no_grad(): - """ - old_batch_size = self.mtlmodel.batch_size - data = [datum for datum in devdata if len(datum) == self.mtlmodel.sequence_length] - self.mtlmodel.batch_size = len(data) - - sents = [' '.join([s.split('\t')[0].strip() for s in sls]) for sls in data] - - sbdtags = [[s.split('\t')[2].strip() for s in sls] for sls in data] - sbdtags = [[self.mtlmodel.sbd_tag2idx[t] for t in tag] for tag in sbdtags] - goldlabels = [t for tags in sbdtags for t in tags] - sbdtags = torch.LongTensor(sbdtags).to(self.device) - """ - - totaldevloss = 0 allpreds = [] allgold = [] @@ -483,21 +453,14 @@ def read_file(mode='train'): goldlabels = [self.mtlmodel.sbdtagset.get_idx_for_item(s) for s in goldlabels] preds,devloss = self.shingle_predict(sents,goldlabels) + if preds is None: + preds = [self.mtlmodel.sbdtagset.get_idx_for_item("O")] * len(goldlabels) + devloss = 0 + totaldevloss += devloss allpreds.extend(preds) allgold.extend(goldlabels) - - - """ - sbdlogits = self.mtlmodel(sents) - devloss = self.sbdloss(sbdlogits,sbdtags) - - preds = torch.argmax(sbdlogits,dim=1) - preds = preds.tolist() - preds = [p for pred in preds for p in pred] - """ - f1 = f1_score(allgold,allpreds) precision = precision_score(allgold,allpreds) recall = recall_score(allgold,allpreds) @@ -513,9 +476,6 @@ def read_file(mode='train'): print('dev recall:' + str(recall)) print ('\n') - self.mtlmodel.train() - #self.mtlmodel.batch_size = old_batch_size - def predict(self): pass @@ -533,15 +493,13 @@ def write_file(filename,mode='train'): with open(filename,'w') as tr: for sent in data: - for i in range(0,len(sent)): # This will disregard the final punct in each sentence. - - if isinstance(sent[i]['id'],tuple): continue + for i in range(0,len(sent)): + if isinstance(sent[i]['id'], tuple): continue # MWE conventions in the conllu file - if i == len(sent) - 2 and (sent[i + 1]['form'] == '.' and sent[i + 1]['upos'] == 'PUNCT'): + if sent[i]['id'] == 1: tr.write(sent[i]['form'] + '\t' + sent[i]['upos'] + '\t' + 'B-SENT' + '\n') - elif i == len(sent) - 1 and (sent[i]['form'] != '.' 
and sent[i]['upos'] != 'PUNCT'): - tr.write(sent[i]['form'] + '\t' + sent[i]['upos'] + '\t' + 'B-SENT' + '\n') - elif i != len(sent) - 1: + + else: tr.write(sent[i]['form'] + '\t' + sent[i]['upos'] + '\t' + 'O' + '\n') traindata = self.read_conllu() From 93f131975c9aeba7c21d5ce5fa5223f84396a89a Mon Sep 17 00:00:00 2001 From: nitin Date: Tue, 19 Jul 2022 10:05:50 +0800 Subject: [PATCH 08/32] remove crf or sbd --- .../lib/multitask_sentsplitter_postagger.py | 162 +++++++++++++----- 1 file changed, 115 insertions(+), 47 deletions(-) diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py index d2743b3..57a9518 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -10,14 +10,48 @@ from transformers import BertModel,BertTokenizerFast from random import sample from collections import defaultdict -from sklearn.metrics import f1_score, precision_score,recall_score from lib.crfutils.crf import CRF from lib.crfutils.viterbi import ViterbiDecoder,ViterbiLoss from time import time -os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256,garbage_collection_threshold:0.2" -SAMPLE_SIZE = 16 +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256" +SAMPLE_SIZE = 32 + + +def spans_score(gold_spans, system_spans): + correct, gi, si = 0, 0, 0 + while gi < len(gold_spans) and si < len(system_spans): + if system_spans[si].start < gold_spans[gi].start: + si += 1 + elif gold_spans[gi].start < system_spans[si].start: + gi += 1 + else: + correct += gold_spans[gi].end == system_spans[si].end + si += 1 + gi += 1 + + return Score(len(gold_spans), len(system_spans), correct) + + +class Score: + def __init__(self, gold_total, system_total, correct, aligned_total=None): + self.correct = correct + self.gold_total = gold_total + self.system_total = system_total + self.aligned_total = aligned_total + self.precision = correct / system_total if system_total else 0.0 + self.recall = correct / gold_total if gold_total else 0.0 + self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0 + self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total + + +class UDSpan: + def __init__(self, start, end): + self.start = start + # Note that self.end marks the first position **after the end** of span, + # so we can use characters[start:end] or range(start, end). + self.end = end class PositionalEncoding(nn.Module): @@ -40,7 +74,7 @@ def forward(self, x): class MTLModel(nn.Module): - def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,batchsize=SAMPLE_SIZE,transformernumlayers=6,nhead=8,sequencelength=128): + def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='gru',ffdim=512,batchsize=SAMPLE_SIZE,transformernumlayers=6,nhead=8,sequencelength=64): super(MTLModel,self).__init__() self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -48,11 +82,13 @@ def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3 self.postagset = {'ADJ':0, 'ADP':1, 'ADV':2, 'AUX':3, 'CCONJ':4, 'DET':5, 'INTJ':6, 'NOUN':7, 'NUM':8, 'PRON':9, 'PROPN':10, 'PUNCT':11, 'SCONJ':12, 'SYM':13, 'VERB':14, 'X':15} # derived from HTB and IAHLTWiki trainsets #TODO: add other UD tags? 
self.sbd_tag2idx = {'B-SENT': 1,'O': 0} + """ self.sbdtagset = Dictionary() for key in self.sbd_tag2idx.keys(): self.sbdtagset.add_item(key.strip()) self.sbdtagset.add_item("") self.sbdtagset.add_item("") + """ self.sequence_length = sequencelength self.batch_size = batchsize @@ -114,7 +150,7 @@ def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3 #self.hidden2postag = nn.Linear(in_features=self.ffdim,out_features=len(self.postagset.keys())).to(self.device) # Label space for sent splitter - self.hidden2sbd = nn.Linear(in_features=self.ffdim,out_features=len(self.sbdtagset)).to(self.device) + self.hidden2sbd = nn.Linear(in_features=self.ffdim,out_features=len(self.sbd_tag2idx.keys())).to(self.device) # param init for name, param in self.hidden2sbd.named_parameters(): @@ -125,8 +161,8 @@ def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3 self.sigmoid = nn.Sigmoid() - self.sbdcrf = CRF(self.sbdtagset,len(self.sbdtagset),init_from_state_dict=False) # TODO: parameterize - self.viterbidecoder = ViterbiDecoder(self.sbdtagset) + #self.sbdcrf = CRF(self.sbdtagset,len(self.sbdtagset),init_from_state_dict=False) # TODO: parameterize + #self.viterbidecoder = ViterbiDecoder(self.sbdtagset) def forward(self,data): @@ -203,7 +239,8 @@ def forward(self,data): # logits for sbd sbdlogits = self.hidden2sbd(feats) - sbdlogits = self.sbdcrf(sbdlogits) + sbdlogits = sbdlogits.permute(0,2,1) + #sbdlogits = self.sbdcrf(sbdlogits) del embeddings @@ -215,7 +252,7 @@ def forward(self,data): return sbdlogits,badrecords class Tagger(): - def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='lstm',ffdim=512,learningrate = 0.0001): + def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='gru',ffdim=512,learningrate = 0.001): self.mtlmodel = MTLModel(rnndim,rnnnumlayers,rnnbidirectional,rnndropout,encodertype,ffdim) @@ -249,13 +286,15 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnnd #self.postagloss = nn.CrossEntropyLoss() #self.postagloss.to(self.device) - self.sbdloss = ViterbiLoss(self.mtlmodel.sbdtagset) + #self.sbdloss = ViterbiLoss(self.mtlmodel.sbdtagset) + self.sbdloss = nn.CrossEntropyLoss(weight=torch.FloatTensor([1,3])) + self.sbdloss.to(self.device) - self.optimizer = torch.optim.Adam(list(self.mtlmodel.encoder.parameters()) + list(self.mtlmodel.fflayer.parameters()) + list(self.mtlmodel.sbdcrf.parameters()) + list(self.mtlmodel.hidden2sbd.parameters()), lr=learningrate) - #self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer,milestones=[500,2500],gamma=0.01) + self.optimizer = torch.optim.AdamW(list(self.mtlmodel.encoder.parameters()) + list(self.mtlmodel.fflayer.parameters()) + list(self.mtlmodel.hidden2sbd.parameters()), lr=learningrate) + self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer,milestones=[150,750],gamma=0.1) self.evalstep = 20 - self.stride_size = 10 + self.stride_size = 20 self.set_seed(42) @@ -330,26 +369,29 @@ def shingle_predict(self,toks,labels=None,type='sbd'): if len(spans) == 0: return None, None - labelspans = [label for span in labelspans for label in span] + #labelspans = [label for span in labelspans for label in span] labelspans = torch.LongTensor(labelspans).to(self.device) - lengths = [self.mtlmodel.sequence_length] * self.mtlmodel.batch_size - lengths = 
torch.LongTensor(lengths).to(self.device) + #lengths = [self.mtlmodel.sequence_length] * self.mtlmodel.batch_size + #lengths = torch.LongTensor(lengths).to(self.device) - score = (sbdlogits, lengths, self.mtlmodel.sbdcrf.transitions) - sbdloss = self.sbdloss(score, labelspans) + #score = (sbdlogits, lengths, self.mtlmodel.sbdcrf.transitions) + #sbdloss = self.sbdloss(score, labelspans) + sbdloss = self.sbdloss(sbdlogits,labelspans) # now get the predictions - sents = [] - for span in spans: - sents.append(Sentence(span)) + #sents = [] + #for span in spans: + # sents.append(Sentence(span)) - predictions, _ = self.mtlmodel.viterbidecoder.decode(score,False,sents) + #predictions, _ = self.mtlmodel.viterbidecoder.decode(score,False,sents) + predictions = torch.argmax(sbdlogits,dim=1) labels = [] for idx in final_mapping: snum, position = final_mapping[idx] - label = self.mtlmodel.sbdtagset.get_idx_for_item(predictions[snum][position][0]) + #label = self.mtlmodel.sbdtagset.get_idx_for_item(predictions[snum][position][0]) + label = predictions[snum][position] labels.append(label) @@ -369,9 +411,9 @@ def read_file(mode='train'): if mode == 'dev': with open(self.devdatafile, 'r') as fi: lines = fi.readlines() - lines = list(reversed(lines)) # hebrew is right to left... + #lines = list(reversed(lines)) # hebrew is right to left... - for idx in range(0, len(lines), 128): + for idx in range(0, len(lines), self.mtlmodel.sequence_length): if idx + self.mtlmodel.sequence_length >= len(lines): slice = lines[idx:len(lines)] else: @@ -379,19 +421,23 @@ def read_file(mode='train'): dataset.append(slice) + test = [d for slice in dataset for d in slice] + assert len(test) == len(lines) + else: with open(self.trainingdatafile,'r') as fi: lines = fi.readlines() - lines = list(reversed(lines)) # hebrew is right to left... + #lines = list(reversed(lines)) # hebrew is right to left... # shingle it here to get more training data - for idx in range(0,len(lines),self.stride_size): + for idx in range(0,len(lines),self.mtlmodel.sequence_length - self.stride_size): if idx + self.mtlmodel.sequence_length >= len(lines): slice = lines[idx:len(lines)] + dataset.append(slice) + break else: slice = lines[idx: idx + self.mtlmodel.sequence_length] - - dataset.append(slice) + dataset.append(slice) return dataset @@ -419,19 +465,22 @@ def read_file(mode='train'): sbdtags.pop(record) self.mtlmodel.batch_size -= 1 - sbdtags = torch.tensor([self.mtlmodel.sbdtagset.get_idx_for_item(s) for sbd in sbdtags for s in sbd]) + #sbdtags = torch.tensor([self.mtlmodel.sbdtagset.get_idx_for_item(s) for sbd in sbdtags for s in sbd]) + sbdtags = torch.tensor([[self.mtlmodel.sbd_tag2idx[s] for s in sbd] for sbd in sbdtags]).to(self.device) - lengths = [self.mtlmodel.sequence_length] * self.mtlmodel.batch_size - lengths = torch.LongTensor(lengths).to(self.device) - scores = (sbdlogits,lengths,self.mtlmodel.sbdcrf.transitions) - sbdloss = self.sbdloss(scores,sbdtags) + #lengths = [self.mtlmodel.sequence_length] * self.mtlmodel.batch_size + #lengths = torch.LongTensor(lengths).to(self.device) + #scores = (sbdlogits,lengths,self.mtlmodel.sbdcrf.transitions) + + #sbdloss = self.sbdloss(scores,sbdtags) + sbdloss = self.sbdloss(sbdlogits,sbdtags) #mtlloss = posloss + sbdloss # uniform weighting. # TODO: learnable weights? 
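One common answer to the "TODO: learnable weights?" above is to weight each task loss by a learned homoscedastic-uncertainty term (Kendall et al., 2018) instead of summing them uniformly. The sketch below is illustrative only and not part of the patch; posloss and sbdloss stand in for the two task losses computed above:

import torch
import torch.nn as nn

class UncertaintyWeighting(nn.Module):
    # one learnable log-variance per task; its weighted sum replaces the uniform posloss + sbdloss
    def __init__(self, num_tasks=2):
        super().__init__()
        self.log_vars = nn.Parameter(torch.zeros(num_tasks))

    def forward(self, *losses):
        total = 0.0
        for i, loss in enumerate(losses):
            total = total + torch.exp(-self.log_vars[i]) * loss + self.log_vars[i]
        return total

# usage sketch: mtlloss = weighting(posloss, sbdloss); mtlloss.backward()
# (the weighting module's parameters would also need to be passed to the optimizer)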
#mtlloss.backward() sbdloss.backward() self.optimizer.step() - #self.scheduler.step() + self.scheduler.step() #self.writer.add_scalar('train_pos_loss', posloss.item(), epoch) self.writer.add_scalar('train_sbd_loss', sbdloss.item(), epoch) @@ -440,40 +489,59 @@ def read_file(mode='train'): if epoch % self.evalstep == 0: self.mtlmodel.eval() - + start = time() with torch.no_grad(): totaldevloss = 0 allpreds = [] allgold = [] + invalidlabelscount = 0 for slice in devdata: sents = [s.split('\t')[0].strip() for s in slice] goldlabels = [s.split('\t')[2].strip() for s in slice] - goldlabels = [self.mtlmodel.sbdtagset.get_idx_for_item(s) for s in goldlabels] + #goldlabels = [self.mtlmodel.sbdtagset.get_idx_for_item(s) for s in goldlabels] + goldlabels = [self.mtlmodel.sbd_tag2idx[s] for s in goldlabels] preds,devloss = self.shingle_predict(sents,goldlabels) if preds is None: - preds = [self.mtlmodel.sbdtagset.get_idx_for_item("O")] * len(goldlabels) + preds = [self.mtlmodel.sbd_tag2idx["O"] for s in goldlabels] * len(goldlabels) + invalidlabelscount += len(goldlabels) devloss = 0 totaldevloss += devloss allpreds.extend(preds) allgold.extend(goldlabels) - f1 = f1_score(allgold,allpreds) - precision = precision_score(allgold,allpreds) - recall = recall_score(allgold,allpreds) + print ('dev inference') + print (time() - start) + + goldspans = [] + predspans = [] + goldstartindex = 0 + predstartindex = 0 + for i in range(0,len(allgold)): + if allgold[i] == 1: #B-SENT + goldspans.append(UDSpan(goldstartindex,i)) + goldstartindex = i + if allpreds[i] == 1: + predspans.append(UDSpan(predstartindex,i)) + predstartindex = i + + + scores = spans_score(goldspans,predspans) + + print ('invalid labels:' + str(invalidlabelscount)) self.writer.add_scalar("dev_loss",round(totaldevloss/len(devdata),2),int(epoch / self.evalstep)) - self.writer.add_scalar("dev_f1", round(f1,2), int(epoch / self.evalstep)) - self.writer.add_scalar("dev_precision", round(precision, 2), int(epoch / self.evalstep)) - self.writer.add_scalar("dev_recall", round(recall, 2), int(epoch / self.evalstep)) + self.writer.add_scalar("dev_f1", round(scores.f1,2), int(epoch / self.evalstep)) + self.writer.add_scalar("dev_precision", round(scores.precision, 2), int(epoch / self.evalstep)) + self.writer.add_scalar("dev_recall", round(scores.recall, 2), int(epoch / self.evalstep)) - print ('dev f1:' + str(f1)) - print('dev precision:' + str(precision)) - print('dev recall:' + str(recall)) + print ('dev f1:' + str(scores.f1)) + print('dev precision:' + str(scores.precision)) + print('dev recall:' + str(scores.recall)) print ('\n') From 12005c7b39e1572e35c5ea51bcc82daa3ba14b25 Mon Sep 17 00:00:00 2001 From: nitin Date: Tue, 26 Jul 2022 04:51:36 +0800 Subject: [PATCH 09/32] shingle the embedding --- .../lib/multitask_sentsplitter_postagger.py | 426 ++++++++++-------- 1 file changed, 228 insertions(+), 198 deletions(-) diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py index 57a9518..6836f25 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -16,7 +16,7 @@ from time import time os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256" -SAMPLE_SIZE = 32 +SAMPLE_SIZE = 16 def spans_score(gold_spans, system_spans): @@ -74,53 +74,60 @@ def forward(self, x): class MTLModel(nn.Module): - def 
__init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='gru',ffdim=512,batchsize=SAMPLE_SIZE,transformernumlayers=6,nhead=8,sequencelength=64): + def __init__(self,sbdrnndim=512,posrnndim=512,sbdrnnnumlayers=2,posrnnnumlayers=2,sbdrnnbidirectional=True,posrnnbidirectional=True,sbdrnndropout=0.3,posrnndropout=0.3,sbdencodertype='lstm',posencodertype='lstm',sbdffdim=512,posffdim=512,batchsize=SAMPLE_SIZE,sbdtransformernumlayers=6,sbdnhead=8,sequencelength=128): super(MTLModel,self).__init__() self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + # tagsets - amend labels here self.postagset = {'ADJ':0, 'ADP':1, 'ADV':2, 'AUX':3, 'CCONJ':4, 'DET':5, 'INTJ':6, 'NOUN':7, 'NUM':8, 'PRON':9, 'PROPN':10, 'PUNCT':11, 'SCONJ':12, 'SYM':13, 'VERB':14, 'X':15} # derived from HTB and IAHLTWiki trainsets #TODO: add other UD tags? self.sbd_tag2idx = {'B-SENT': 1,'O': 0} - """ - self.sbdtagset = Dictionary() - for key in self.sbd_tag2idx.keys(): - self.sbdtagset.add_item(key.strip()) - self.sbdtagset.add_item("") - self.sbdtagset.add_item("") - """ + # POS tagset in Dictionary object for Flair CRF + self.postagsetcrf = Dictionary() + for key in self.postagset.keys(): + self.postagsetcrf.add_item(key.strip()) + self.postagsetcrf.add_item("") + self.postagsetcrf.add_item("") + # shared hyper-parameters self.sequence_length = sequencelength self.batch_size = batchsize - self.encodertype = encodertype + # Embedding parameters and model self.tokenizer = BertTokenizerFast.from_pretrained('onlplab/alephbert-base') self.model = BertModel.from_pretrained('onlplab/alephbert-base').to(self.device) - - # Bi-LSTM Encoder - self.embeddingdim = 768 * 1 # based on BERT model with Flair layers - self.rnndim = rnndim - self.rnnnumlayers = rnnnumlayers - self.rnnbidirectional = rnnbidirectional - self.rnndropout = rnndropout - - if encodertype == 'lstm': - self.encoder = nn.LSTM(input_size=self.embeddingdim, hidden_size=self.rnndim // 2, - num_layers=self.rnnnumlayers, bidirectional=self.rnnbidirectional, - dropout=self.rnndropout,batch_first=True).to(self.device) - elif encodertype == 'gru': - self.encoder = nn.GRU(input_size=self.embeddingdim, hidden_size=self.rnndim // 2, - num_layers=self.rnnnumlayers, bidirectional=self.rnnbidirectional, - dropout=self.rnndropout,batch_first=True).to(self.device) - elif self.encodertype == 'transformer': - self.transformernumlayers = transformernumlayers - self.nhead = nhead - self.encoderlayer = nn.TransformerEncoderLayer(d_model= self.embeddingdim,nhead=nhead).to(self.device) - self.encoder = nn.TransformerEncoder(self.encoderlayer,num_layers=self.transformernumlayers).to(self.device) - self.posencoder = PositionalEncoding(d_model=self.embeddingdim).to(self.device) + self.embeddingdim = 768 + + # Bi-LSTM Encoder for SBD + self.sbdrnndim = sbdrnndim + self.sbdrnnnumlayers = sbdrnnnumlayers + self.sbdrnnbidirectional = sbdrnnbidirectional + self.sbdrnndropout = sbdrnndropout + + #Bi-LSTM Encoder for POS tagging + self.posrnndim = posrnndim + self.posrnnnumlayers = posrnnnumlayers + self.posrnnbidirectional = posrnnbidirectional + self.posrnndropout = posrnndropout + + if sbdencodertype == 'lstm': + self.sbdencoder = nn.LSTM(input_size=self.embeddingdim, hidden_size=self.sbdrnndim // 2, + num_layers=self.sbdrnnnumlayers, bidirectional=self.sbdrnnbidirectional, + dropout=self.sbdrnndropout,batch_first=True).to(self.device) + elif sbdencodertype == 'gru': + self.sbdencoder = nn.GRU(input_size=self.embeddingdim, 
hidden_size=self.sbdrnndim // 2, + num_layers=self.sbdrnnnumlayers, bidirectional=self.sbdrnnbidirectional, + dropout=self.sbdrnndropout,batch_first=True).to(self.device) + elif sbdencodertype == 'transformer': + self.sbdtransformernumlayers = sbdtransformernumlayers + self.sbdnhead = sbdnhead + self.sbdencoderlayer = nn.TransformerEncoderLayer(d_model= self.embeddingdim,nhead=self.sbdnhead).to(self.device) + self.sbdencoder = nn.TransformerEncoder(self.sbdencoderlayer,num_layers=self.sbdtransformernumlayers).to(self.device) + self.sbdposencoder = PositionalEncoding(d_model=self.embeddingdim).to(self.device) # param init - for name, param in self.encoder.named_parameters(): + for name, param in self.sbdencoder.named_parameters(): try: if 'bias' in name: nn.init.constant_(param,0.0) @@ -129,28 +136,62 @@ def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3 except ValueError as ex: nn.init.constant_(param,0.0) + if posencodertype == 'lstm': + self.posencoder = nn.LSTM(input_size=self.embeddingdim, hidden_size=self.posrnndim // 2, + num_layers=self.posrnnnumlayers, bidirectional=self.posrnnbidirectional, + dropout=self.posrnndropout,batch_first=True).to(self.device) + elif posencodertype == 'gru': + self.posencoder = nn.GRU(input_size=self.embeddingdim, hidden_size=self.posrnndim // 2, + num_layers=self.posrnnnumlayers, bidirectional=self.posrnnbidirectional, + dropout=self.posrnndropout,batch_first=True).to(self.device) + + # param init + for name, param in self.posencoder.named_parameters(): + try: + if 'bias' in name: + nn.init.constant_(param, 0.0) + elif 'weight' in name: + nn.init.xavier_uniform_(param) + except ValueError as ex: + nn.init.constant_(param, 0.0) + self.relu = nn.ReLU() # Intermediate feedforward layer - self.ffdim = ffdim - if self.encodertype == 'transformer': - self.fflayer = nn.Linear(in_features=self.embeddingdim, out_features=self.ffdim).to(self.device) + self.sbdffdim = sbdffdim + if sbdencodertype == 'transformer': + self.sbdfflayer = nn.Linear(in_features=self.embeddingdim, out_features=self.sbdffdim).to(self.device) else: - self.fflayer = nn.Linear(in_features=self.rnndim, out_features=self.ffdim).to(self.device) + self.sbdfflayer = nn.Linear(in_features=self.sbdrnndim, out_features=self.sbdffdim).to(self.device) + + # param init + for name, param in self.sbdfflayer.named_parameters(): + if 'bias' in name: + nn.init.constant_(param, 0.0) + elif 'weight' in name: + nn.init.xavier_normal_(param) + + # Intermediate feedforward layer + self.posffdim = posffdim + self.posfflayer = nn.Linear(in_features=self.posrnndim, out_features=self.posffdim).to(self.device) # param init - for name, param in self.fflayer.named_parameters(): + for name, param in self.posfflayer.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_normal_(param) # Label space for the pos tagger - # TODO: CRF? 
- #self.hidden2postag = nn.Linear(in_features=self.ffdim,out_features=len(self.postagset.keys())).to(self.device) + self.hidden2postag = nn.Linear(in_features=self.posffdim,out_features=len(self.postagset.keys())).to(self.device) + for name, param in self.hidden2postag.named_parameters(): + if 'bias' in name: + nn.init.constant_(param, 0.0) + elif 'weight' in name: + nn.init.xavier_normal_(param) # Label space for sent splitter - self.hidden2sbd = nn.Linear(in_features=self.ffdim,out_features=len(self.sbd_tag2idx.keys())).to(self.device) + self.hidden2sbd = nn.Linear(in_features=self.sbdffdim,out_features=len(self.sbd_tag2idx.keys())).to(self.device) # param init for name, param in self.hidden2sbd.named_parameters(): @@ -160,16 +201,83 @@ def __init__(self,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3 nn.init.xavier_normal_(param) self.sigmoid = nn.Sigmoid() + self.dropout = nn.Dropout(p=0.5) + self.embeddingdropout = nn.Dropout(p=0.1) + + self.poscrf = CRF(self.postagsetcrf,len(self.postagsetcrf),init_from_state_dict=False) # TODO: parameterize + self.viterbidecoder = ViterbiDecoder(self.postagsetcrf) + + self.stride_size = 10 + self.sbdencodertype = sbdencodertype + + def shingle(self,toks,labels=None): + """ + Returns the span embeddings, labelspans, and 'final mapping' + """ + spans = [] + labelspans = [] + final_mapping = {} + + # Hack tokens up into overlapping shingles + #wraparound = toks[-self.stride_size:] + toks + toks[: self.mtlmodel.sequence_length] + wraparound = torch.cat((toks[-self.stride_size:],toks,toks[: self.sequence_length]),dim=0) + if labels: + labelwraparound = labels[-self.stride_size:] + labels + labels[: self.sequence_length] + idx = 0 + mapping = defaultdict(set) + snum = 0 + while idx < len(toks): + if idx + self.sequence_length < len(wraparound): + span = wraparound[idx: idx + self.sequence_length] + if labels: + labelspan = labelwraparound[idx: idx + self.sequence_length] + else: + span = wraparound[idx:] + if labels: + labelspan = labelwraparound[idx:] + + spans.append(span) + if labels: + labelspans.append(labelspan) + + for i in range(idx - self.stride_size, idx + self.sequence_length - self.stride_size): + # start, end, snum + if i >= 0 and i < len(toks): + mapping[i].add( + (idx - self.stride_size, idx + self.sequence_length - self.stride_size, snum)) + idx += self.stride_size + snum += 1 + + for idx in mapping: + best = self.sequence_length + for m in mapping[idx]: + start, end, snum = m + dist_to_end = end - idx + dist_to_start = idx - start + delta = abs(dist_to_end - dist_to_start) + if delta < best: + best = delta + final_mapping[idx] = (snum, idx - start) # Get sentence number and position in sentence - #self.sbdcrf = CRF(self.sbdtagset,len(self.sbdtagset),init_from_state_dict=False) # TODO: parameterize - #self.viterbidecoder = ViterbiDecoder(self.sbdtagset) + spans = torch.stack(spans) + return spans,labelspans,final_mapping + def forward(self,data,mode='train'): - def forward(self,data): + badrecords = [] # stores records where AlephBERT's tokenization messed up the sentence's sequence length, and removes these sentences from the batch. - badrecords = [] - data = [d.split() for d in data] # for AlephBERT - tokens = self.tokenizer(data,return_tensors='pt',padding=True,is_split_into_words=True).to(self.device) # tell AlephBERT that there is some tokenization already. Otherwise its own subword tokenization messes things up. 
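For reference, the loop that follows averages AlephBERT subword vectors back into one vector per input word so that the sequence stays aligned with the token-level labels. A minimal standalone sketch of that alignment using the fast tokenizer's word_ids() is shown below; it is illustrative only (the model and tokenizer names match the ones loaded in __init__, the example words and variable names are made up):

import torch
from transformers import BertModel, BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('onlplab/alephbert-base')
model = BertModel.from_pretrained('onlplab/alephbert-base')

words = ['ה', 'ילד', 'הלך', 'הביתה']                      # already word-tokenized input
enc = tokenizer([words], is_split_into_words=True, return_tensors='pt', padding=True)
with torch.no_grad():
    hidden = model(**enc).last_hidden_state[0]            # (num_subwords, 768)

word_ids = enc.word_ids(0)                                # subword position -> word index (None for special tokens)
word_vecs = []
for w in range(len(words)):
    rows = [i for i, wid in enumerate(word_ids) if wid == w]
    word_vecs.append(hidden[rows].mean(dim=0))            # average the subword vectors of word w
word_vecs = torch.stack(word_vecs)                        # (num_words, 768), aligned with the labels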
+ if mode == 'train': # training is on a batch, so 3D tensor + sentences = [' '.join([s.split('\t')[0].strip() for s in sls]) for sls in data] + sbdlabels = [[self.sbd_tag2idx[s.split('\t')[2].strip()] for s in sls] for sls in data] + elif mode == 'dev': # inference is on a single record, 2D tensor + sentences = [' '.join([s.split('\t')[0].strip() for s in data])] + sbdlabels = [s.split('\t')[2].strip() for s in data] + else: # test - has no labels, and 2D tensor single record + sentences = [s.split('\t')[0].strip() for s in data] + sbdlabels = None + + sentences = [d.split() for d in sentences] # for AlephBERT + tokens = self.tokenizer(sentences,return_tensors='pt',padding=True,is_split_into_words=True).to(self.device) # tell AlephBERT that there is some tokenization already. Otherwise its own subword tokenization messes things up. embeddings = self.model(**tokens) embeddings = embeddings[0] @@ -202,8 +310,6 @@ def forward(self,data): slice = embeddings[k][indices[0]:indices[-1] + 1] slice = torch.mean(input=slice,dim=0,keepdim=False) emb.append(slice) - - try: assert len(emb) == self.sequence_length # averaging was correct and aligns with the labels except AssertionError: @@ -214,47 +320,65 @@ def forward(self,data): emb = torch.stack(emb) avgembeddings.append(emb) + badrecords = sorted(badrecords,reverse=True) + if len(avgembeddings) > 0: avgembeddings = torch.stack(avgembeddings) + for record in badrecords: + sbdlabels.pop(record) else: - return None,badrecords + return None,None,None #print ('average embeddings') #print (time() - start) - if self.encodertype in ('lstm','gru'): - feats, _ = self.encoder(avgembeddings) + if mode != 'train': + # squeeze the embedding, as it's a single sentence + avgembeddings = torch.squeeze(avgembeddings) + finalembeddings,finallabels,finalmapping = self.shingle(avgembeddings,sbdlabels) + if mode != 'test': + finallabels = [[self.sbd_tag2idx[s] for s in sls] for sls in finallabels] else: - feats = self.posencoder(avgembeddings) - feats = self.encoder(feats) + finalembeddings = avgembeddings + finallabels = sbdlabels + finalmapping = None + finalembeddings = self.embeddingdropout(finalembeddings) + + # SBD encoder and labels + if self.sbdencodertype in ('lstm','gru'): + feats, _ = self.sbdencoder(finalembeddings) + else: + feats = self.sbdposencoder(finalembeddings) + feats = self.sbdencoder(feats) # Intermediate Feedforward layer - feats = self.fflayer(feats) + feats = self.sbdfflayer(feats) feats = self.relu(feats) - - # logits for pos - #poslogits = self.hidden2postag(feats) - #poslogits = poslogits.permute(0,2,1) + feats = self.dropout(feats) # logits for sbd sbdlogits = self.hidden2sbd(feats) - sbdlogits = sbdlogits.permute(0,2,1) - #sbdlogits = self.sbdcrf(sbdlogits) + #sbdlogits = sbdlogits.permute(0, 2, 1) + + # logits for pos + #poslogits = self.hidden2postag(feats) + #poslogits = poslogits.permute(0,2,1) del embeddings + del finalembeddings del avgembeddings del feats torch.cuda.empty_cache() - return sbdlogits,badrecords + return sbdlogits,finallabels,finalmapping # returns the logits class Tagger(): - def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnndim=512,rnnnumlayers=2,rnnbidirectional=True,rnndropout=0.3,encodertype='gru',ffdim=512,learningrate = 0.001): + def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=512,sbdrnnnumlayers=2,sbdrnnbidirectional=True,sbdrnndropout=0.3,sbdencodertype='lstm',sbdffdim=512,learningrate = 0.001): - self.mtlmodel = 
MTLModel(rnndim,rnnnumlayers,rnnbidirectional,rnndropout,encodertype,ffdim) + self.mtlmodel = MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdrnndropout=sbdrnndropout,sbdencodertype=sbdencodertype,sbdffdim=sbdffdim) if trainflag == True: @@ -283,19 +407,18 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,rnnd self.learningrate = learningrate # Loss for pos tagging - #self.postagloss = nn.CrossEntropyLoss() - #self.postagloss.to(self.device) + self.postagloss = ViterbiLoss(self.mtlmodel.postagsetcrf) + self.postagloss.to(self.device) - #self.sbdloss = ViterbiLoss(self.mtlmodel.sbdtagset) + # Loss for sentence splitting self.sbdloss = nn.CrossEntropyLoss(weight=torch.FloatTensor([1,3])) self.sbdloss.to(self.device) - self.optimizer = torch.optim.AdamW(list(self.mtlmodel.encoder.parameters()) + list(self.mtlmodel.fflayer.parameters()) + list(self.mtlmodel.hidden2sbd.parameters()), lr=learningrate) - self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer,milestones=[150,750],gamma=0.1) + self.optimizer = torch.optim.AdamW(list(self.mtlmodel.sbdencoder.parameters()) + list(self.mtlmodel.sbdfflayer.parameters()) + + list(self.mtlmodel.hidden2sbd.parameters()), lr=learningrate) + #self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer,milestones=[150,400],gamma=0.1) self.evalstep = 20 - self.stride_size = 20 - self.set_seed(42) def set_seed(self, seed): @@ -303,106 +426,6 @@ def set_seed(self, seed): random.seed(seed) torch.manual_seed(seed) - - def shingle_predict(self,toks,labels=None,type='sbd'): - - """ - Shingles data, then predicts the tag. Applies to dev and test sets only - pass labels if they exist e.g for dev / test Otherwise it's inference on new data. 
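The forward pass above collapses AlephBERT subword vectors back to one vector per input word by averaging the subwords that belong to the same word. A standalone sketch of that alignment using the Hugging Face fast-tokenizer word ids (the two Hebrew words are placeholders; any fast tokenizer/model pair works the same way):

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("onlplab/alephbert-base")
model = AutoModel.from_pretrained("onlplab/alephbert-base")

words = ["שלום", "עולם", "."]                           # already-tokenized input, one sentence
enc = tokenizer([words], is_split_into_words=True, padding=True, return_tensors="pt")
with torch.no_grad():
    hidden = model(**enc).last_hidden_state             # [1, num_subwords, 768]

word_ids = enc.word_ids(batch_index=0)                  # subword -> word index; [CLS]/[SEP]/pads are None
word_vecs = []
for w in range(len(words)):
    idx = [i for i, wid in enumerate(word_ids) if wid == w]
    word_vecs.append(hidden[0, idx].mean(dim=0))         # average this word's subword vectors
word_vecs = torch.stack(word_vecs)                        # [len(words), 768], aligned with the labels
print(word_vecs.shape)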
- pass type for the type of label, sbd or pos - """ - - spans = [] - if labels: - labelspans = [] - final_mapping = {} - # Hack tokens up into overlapping shingles - wraparound = toks[-self.stride_size:] + toks + toks[: self.mtlmodel.sequence_length] - if labels: - labelwraparound = labels[-self.stride_size:] + labels + labels[: self.mtlmodel.sequence_length] - idx = 0 - mapping = defaultdict(set) - snum = 0 - while idx < len(toks): - if idx + self.mtlmodel.sequence_length < len(wraparound): - span = wraparound[idx: idx + self.mtlmodel.sequence_length] - if labels: - labelspan = labelwraparound[idx: idx + self.mtlmodel.sequence_length] - else: - span = wraparound[idx:] - if labels: - labelspan = labelwraparound[idx:] - sent = " ".join(span) - spans.append(sent) - if labels: - labelspans.append(labelspan) - - for i in range(idx - self.stride_size, idx + self.mtlmodel.sequence_length - self.stride_size): - # start, end, snum - if i >= 0 and i < len(toks): - mapping[i].add((idx - self.stride_size, idx + self.mtlmodel.sequence_length - self.stride_size, snum)) - idx += self.stride_size - snum += 1 - - for idx in mapping: - best = self.mtlmodel.sequence_length - for m in mapping[idx]: - start, end, snum = m - dist_to_end = end - idx - dist_to_start = idx - start - delta = abs(dist_to_end - dist_to_start) - if delta < best: - best = delta - final_mapping[idx] = (snum, idx - start) # Get sentence number and position in sentence - - self.mtlmodel.batch_size = len(spans) - - # get the loss - sbdlogits,badrecords = self.mtlmodel(spans) - - badrecords = sorted(badrecords,reverse=True) - for record in badrecords: - labelspans.pop(record) - spans.pop(record) - self.mtlmodel.batch_size -= 1 - - if len(spans) == 0: - return None, None - - #labelspans = [label for span in labelspans for label in span] - labelspans = torch.LongTensor(labelspans).to(self.device) - - #lengths = [self.mtlmodel.sequence_length] * self.mtlmodel.batch_size - #lengths = torch.LongTensor(lengths).to(self.device) - - #score = (sbdlogits, lengths, self.mtlmodel.sbdcrf.transitions) - #sbdloss = self.sbdloss(score, labelspans) - sbdloss = self.sbdloss(sbdlogits,labelspans) - - # now get the predictions - #sents = [] - #for span in spans: - # sents.append(Sentence(span)) - - #predictions, _ = self.mtlmodel.viterbidecoder.decode(score,False,sents) - predictions = torch.argmax(sbdlogits,dim=1) - - labels = [] - for idx in final_mapping: - snum, position = final_mapping[idx] - #label = self.mtlmodel.sbdtagset.get_idx_for_item(predictions[snum][position][0]) - label = predictions[snum][position] - - labels.append(label) - - del sbdlogits - del labelspans - - torch.cuda.empty_cache() - - return labels,sbdloss.item() - - def train(self): def read_file(mode='train'): @@ -430,7 +453,7 @@ def read_file(mode='train'): #lines = list(reversed(lines)) # hebrew is right to left... 
# shingle it here to get more training data - for idx in range(0,len(lines),self.mtlmodel.sequence_length - self.stride_size): + for idx in range(0,len(lines),self.mtlmodel.sequence_length - self.mtlmodel.stride_size): if idx + self.mtlmodel.sequence_length >= len(lines): slice = lines[idx:len(lines)] dataset.append(slice) @@ -441,7 +464,7 @@ def read_file(mode='train'): return dataset - epochs = 1000 + epochs = 1500 trainingdata = read_file() devdata = read_file(mode='dev') @@ -455,32 +478,24 @@ def read_file(mode='train'): data = [datum for datum in data if len(datum) == self.mtlmodel.sequence_length] self.mtlmodel.batch_size = len(data) - sents = [' '.join([s.split('\t')[0].strip() for s in sls]) for sls in data] - - sbdlogits, badrecords = self.mtlmodel(sents) - badrecords = sorted(badrecords, reverse=True) - - sbdtags = [[s.split('\t')[2].strip() for s in sls] for sls in data] - for record in badrecords: - sbdtags.pop(record) - self.mtlmodel.batch_size -= 1 + sbdlogits, sbdlabels, badrecords = self.mtlmodel(data) + sbdtags = torch.LongTensor(sbdlabels).to(self.device) - #sbdtags = torch.tensor([self.mtlmodel.sbdtagset.get_idx_for_item(s) for sbd in sbdtags for s in sbd]) - sbdtags = torch.tensor([[self.mtlmodel.sbd_tag2idx[s] for s in sbd] for sbd in sbdtags]).to(self.device) + lengths = [self.mtlmodel.sequence_length] * self.mtlmodel.batch_size + lengths = torch.LongTensor(lengths).to(self.device) + #scores = (poslogits,lengths,self.mtlmodel.poscrf.transitions) + #sbdloss = self.sbdloss - #lengths = [self.mtlmodel.sequence_length] * self.mtlmodel.batch_size - #lengths = torch.LongTensor(lengths).to(self.device) - #scores = (sbdlogits,lengths,self.mtlmodel.sbdcrf.transitions) - - #sbdloss = self.sbdloss(scores,sbdtags) + sbdlogits = sbdlogits.permute(0,2,1) sbdloss = self.sbdloss(sbdlogits,sbdtags) + #posloss = self.postagloss(scores,postags) #mtlloss = posloss + sbdloss # uniform weighting. # TODO: learnable weights? 
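The permute before the sentence-boundary loss is needed because nn.CrossEntropyLoss over a sequence expects logits shaped (batch, num_classes, seq_len), while the tagging head emits (batch, seq_len, num_classes); the [1, 3] weight up-weights the rarer B-SENT class. A toy check of the same shape bookkeeping:

import torch
import torch.nn as nn

batch, seqlen, nclasses = 4, 128, 2                      # two SBD classes: O and B-SENT
logits = torch.randn(batch, seqlen, nclasses)            # what the tagging head emits
targets = torch.randint(0, nclasses, (batch, seqlen))    # gold label ids per token

loss_fn = nn.CrossEntropyLoss(weight=torch.FloatTensor([1, 3]))  # up-weight the rare class
loss = loss_fn(logits.permute(0, 2, 1), targets)          # class dim must come right after batch
print(loss.item())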
#mtlloss.backward() sbdloss.backward() self.optimizer.step() - self.scheduler.step() + #self.scheduler.step() #self.writer.add_scalar('train_pos_loss', posloss.item(), epoch) self.writer.add_scalar('train_sbd_loss', sbdloss.item(), epoch) @@ -489,23 +504,38 @@ def read_file(mode='train'): if epoch % self.evalstep == 0: self.mtlmodel.eval() - start = time() + #start = time() with torch.no_grad(): totaldevloss = 0 allpreds = [] allgold = [] invalidlabelscount = 0 + for slice in devdata: - sents = [s.split('\t')[0].strip() for s in slice] + preds = [] + goldlabels = [s.split('\t')[2].strip() for s in slice] - #goldlabels = [self.mtlmodel.sbdtagset.get_idx_for_item(s) for s in goldlabels] goldlabels = [self.mtlmodel.sbd_tag2idx[s] for s in goldlabels] - preds,devloss = self.shingle_predict(sents,goldlabels) - if preds is None: - preds = [self.mtlmodel.sbd_tag2idx["O"] for s in goldlabels] * len(goldlabels) + sbdlogits, sbdlabels, finalmapping = self.mtlmodel(slice,mode='dev') + + if sbdlabels is not None: + # get the predictions - on non-shingled data + for idx in finalmapping: + snum, position = finalmapping[idx] + label = torch.argmax(sbdlogits[snum][position]).item() + + preds.append(label) + + # get the loss - on 'shingled' data + sbdlogits = sbdlogits.permute(0,2,1) + sbdtags = torch.LongTensor(sbdlabels).to(self.device) + devloss = self.sbdloss(sbdlogits, sbdtags).item() + + else: + preds = [self.mtlmodel.sbd_tag2idx["O"] for _ in goldlabels] invalidlabelscount += len(goldlabels) devloss = 0 @@ -513,8 +543,8 @@ def read_file(mode='train'): allpreds.extend(preds) allgold.extend(goldlabels) - print ('dev inference') - print (time() - start) + #print ('dev inference') + #print (time() - start) goldspans = [] predspans = [] From ccc763a0b79d4239d65ec2150c27afe57f70454c Mon Sep 17 00:00:00 2001 From: nitin Date: Wed, 27 Jul 2022 01:21:46 +0800 Subject: [PATCH 10/32] pos tagging in MTL - initial --- .../lib/multitask_sentsplitter_postagger.py | 208 ++++++++++++------ 1 file changed, 144 insertions(+), 64 deletions(-) diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py index 6836f25..eb4c03f 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -6,13 +6,14 @@ import random import math -from flair.data import Sentence, Dictionary +from flair.data import Dictionary, Sentence from transformers import BertModel,BertTokenizerFast from random import sample from collections import defaultdict from lib.crfutils.crf import CRF from lib.crfutils.viterbi import ViterbiDecoder,ViterbiLoss + from time import time os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256" @@ -33,7 +34,6 @@ def spans_score(gold_spans, system_spans): return Score(len(gold_spans), len(system_spans), correct) - class Score: def __init__(self, gold_total, system_total, correct, aligned_total=None): self.correct = correct @@ -137,11 +137,11 @@ def __init__(self,sbdrnndim=512,posrnndim=512,sbdrnnnumlayers=2,posrnnnumlayers= nn.init.constant_(param,0.0) if posencodertype == 'lstm': - self.posencoder = nn.LSTM(input_size=self.embeddingdim, hidden_size=self.posrnndim // 2, + self.posencoder = nn.LSTM(input_size=self.embeddingdim + 1, hidden_size=self.posrnndim // 2, num_layers=self.posrnnnumlayers, bidirectional=self.posrnnbidirectional, dropout=self.posrnndropout,batch_first=True).to(self.device) elif posencodertype == 'gru': - self.posencoder = nn.GRU(input_size=self.embeddingdim, hidden_size=self.posrnndim 
// 2, + self.posencoder = nn.GRU(input_size=self.embeddingdim + 1, hidden_size=self.posrnndim // 2, num_layers=self.posrnnnumlayers, bidirectional=self.posrnnbidirectional, dropout=self.posrnndropout,batch_first=True).to(self.device) @@ -183,7 +183,7 @@ def __init__(self,sbdrnndim=512,posrnndim=512,sbdrnnnumlayers=2,posrnnnumlayers= nn.init.xavier_normal_(param) # Label space for the pos tagger - self.hidden2postag = nn.Linear(in_features=self.posffdim,out_features=len(self.postagset.keys())).to(self.device) + self.hidden2postag = nn.Linear(in_features=self.posffdim,out_features=len(self.postagsetcrf)).to(self.device) for name, param in self.hidden2postag.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) @@ -207,8 +207,15 @@ def __init__(self,sbdrnndim=512,posrnndim=512,sbdrnnnumlayers=2,posrnnnumlayers= self.poscrf = CRF(self.postagsetcrf,len(self.postagsetcrf),init_from_state_dict=False) # TODO: parameterize self.viterbidecoder = ViterbiDecoder(self.postagsetcrf) + for name, param in self.poscrf.named_parameters(): + if 'bias' in name: + nn.init.constant_(param, 0.0) + elif 'weight' in name: + nn.init.xavier_normal_(param) + self.stride_size = 10 self.sbdencodertype = sbdencodertype + self.posencodertype = posencodertype def shingle(self,toks,labels=None): """ @@ -219,7 +226,6 @@ def shingle(self,toks,labels=None): final_mapping = {} # Hack tokens up into overlapping shingles - #wraparound = toks[-self.stride_size:] + toks + toks[: self.mtlmodel.sequence_length] wraparound = torch.cat((toks[-self.stride_size:],toks,toks[: self.sequence_length]),dim=0) if labels: labelwraparound = labels[-self.stride_size:] + labels + labels[: self.sequence_length] @@ -260,6 +266,7 @@ def shingle(self,toks,labels=None): final_mapping[idx] = (snum, idx - start) # Get sentence number and position in sentence spans = torch.stack(spans) + return spans,labelspans,final_mapping def forward(self,data,mode='train'): @@ -269,12 +276,15 @@ def forward(self,data,mode='train'): if mode == 'train': # training is on a batch, so 3D tensor sentences = [' '.join([s.split('\t')[0].strip() for s in sls]) for sls in data] sbdlabels = [[self.sbd_tag2idx[s.split('\t')[2].strip()] for s in sls] for sls in data] - elif mode == 'dev': # inference is on a single record, 2D tensor + poslabels = [[self.postagsetcrf.get_idx_for_item(s.split('\t')[1].strip()) for s in sls] for sls in data] + elif mode == 'dev': # inference is on a single record sentences = [' '.join([s.split('\t')[0].strip() for s in data])] sbdlabels = [s.split('\t')[2].strip() for s in data] + poslabels = [self.postagsetcrf.get_idx_for_item(s.split('\t')[1].strip()) for s in data] else: # test - has no labels, and 2D tensor single record - sentences = [s.split('\t')[0].strip() for s in data] + sentences = [' '.join([s.split('\t')[0].strip() for s in data])] sbdlabels = None + poslabels = None sentences = [d.split() for d in sentences] # for AlephBERT tokens = self.tokenizer(sentences,return_tensors='pt',padding=True,is_split_into_words=True).to(self.device) # tell AlephBERT that there is some tokenization already. Otherwise its own subword tokenization messes things up. @@ -303,7 +313,7 @@ def forward(self,data,mode='train'): indices = [j for j,x in enumerate(tokens.encodings[k].words) if x == i] if len(indices) == 0: # This strange case needs to be handled. 
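The POS encoder's input_size is embeddingdim + 1 because the sentence-boundary predictions are later concatenated onto the word embeddings as one extra feature per token before the POS RNN runs. A small sketch of that feature concatenation with illustrative dimensions (the prediction channel is cast to float so it can be concatenated with the embeddings):

import torch
import torch.nn as nn

batch, seqlen, embdim = 2, 128, 768
embeddings = torch.randn(batch, seqlen, embdim)                  # word-level embeddings
sbd_preds = torch.randint(0, 2, (batch, seqlen, 1)).float()      # 0/1 boundary prediction per token

pos_input = torch.cat((embeddings, sbd_preds), dim=2)            # [batch, seqlen, embdim + 1]
pos_encoder = nn.LSTM(input_size=embdim + 1, hidden_size=256,
                      num_layers=2, bidirectional=True, batch_first=True)
feats, _ = pos_encoder(pos_input)
print(feats.shape)                                               # torch.Size([2, 128, 512])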
- emb.append(torch.zeros(768,device=self.device)) + emb.append(torch.zeros(self.embeddingdim,device=self.device)) elif len(indices) == 1: # no need to average emb.append(embeddings[k][indices[0]]) else: # needs to aggregate - average @@ -326,8 +336,10 @@ def forward(self,data,mode='train'): avgembeddings = torch.stack(avgembeddings) for record in badrecords: sbdlabels.pop(record) + poslabels.pop(record) + self.batch_size -= 1 else: - return None,None,None + return None,None,None,None,None #print ('average embeddings') #print (time() - start) @@ -335,12 +347,13 @@ def forward(self,data,mode='train'): if mode != 'train': # squeeze the embedding, as it's a single sentence avgembeddings = torch.squeeze(avgembeddings) - finalembeddings,finallabels,finalmapping = self.shingle(avgembeddings,sbdlabels) + finalembeddings,finalsbdlabels,finalmapping = self.shingle(avgembeddings,sbdlabels) + avgembeddings = torch.unsqueeze(avgembeddings,dim=0) if mode != 'test': - finallabels = [[self.sbd_tag2idx[s] for s in sls] for sls in finallabels] + finalsbdlabels = [[self.sbd_tag2idx[s] for s in sls] for sls in finalsbdlabels] else: finalembeddings = avgembeddings - finallabels = sbdlabels + finalsbdlabels = sbdlabels finalmapping = None finalembeddings = self.embeddingdropout(finalembeddings) @@ -360,11 +373,37 @@ def forward(self,data,mode='train'): # logits for sbd sbdlogits = self.hidden2sbd(feats) - #sbdlogits = sbdlogits.permute(0, 2, 1) + #get the sbd predictions as input to the POS encoder + if mode == 'train': + sbdpreds = torch.argmax(sbdlogits,dim=2,keepdim=True) + else: + sbdpreds = [] + for idx in finalmapping: + snum, position = finalmapping[idx] + label = torch.argmax(sbdlogits[snum][position]).item() + sbdpreds.append(label) + sbdpreds = torch.LongTensor(sbdpreds) + sbdpreds = torch.unsqueeze(sbdpreds,dim=0) + sbdpreds = torch.unsqueeze(sbdpreds, dim=2) + sbdpreds = sbdpreds.to(self.device) + + posembeddings = torch.cat((avgembeddings,sbdpreds),dim=2) + if mode in ('dev','test'): + sbdpreds = torch.squeeze(sbdpreds,dim=2) + sbdpreds = torch.squeeze(sbdpreds, dim=0) + sbdpreds = sbdpreds.tolist() + else: + sbdpreds = None + + if self.posencodertype in ('lstm','gru'): + feats,_ = self.posencoder(posembeddings) # logits for pos - #poslogits = self.hidden2postag(feats) - #poslogits = poslogits.permute(0,2,1) + feats = self.posfflayer(feats) + feats = self.relu(feats) + feats = self.dropout(feats) + poslogits = self.hidden2postag(feats) + poslogits = self.poscrf(poslogits) del embeddings del finalembeddings @@ -373,7 +412,7 @@ def forward(self,data,mode='train'): torch.cuda.empty_cache() - return sbdlogits,finallabels,finalmapping # returns the logits + return sbdlogits,finalsbdlabels,sbdpreds, poslogits,poslabels # returns the logits and labels class Tagger(): def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=512,sbdrnnnumlayers=2,sbdrnnbidirectional=True,sbdrnndropout=0.3,sbdencodertype='lstm',sbdffdim=512,learningrate = 0.001): @@ -415,7 +454,9 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdr self.sbdloss.to(self.device) self.optimizer = torch.optim.AdamW(list(self.mtlmodel.sbdencoder.parameters()) + list(self.mtlmodel.sbdfflayer.parameters()) + - list(self.mtlmodel.hidden2sbd.parameters()), lr=learningrate) + list(self.mtlmodel.hidden2sbd.parameters()) + list(self.mtlmodel.posencoder.parameters()) + list(self.mtlmodel.posfflayer.parameters()) + + list(self.mtlmodel.hidden2postag.parameters()) + 
list(self.mtlmodel.poscrf.parameters()), lr=learningrate) + #self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer,milestones=[150,400],gamma=0.1) self.evalstep = 20 @@ -478,28 +519,30 @@ def read_file(mode='train'): data = [datum for datum in data if len(datum) == self.mtlmodel.sequence_length] self.mtlmodel.batch_size = len(data) - sbdlogits, sbdlabels, badrecords = self.mtlmodel(data) + sbdlogits, sbdlabels, _, poslogits,poslabels = self.mtlmodel(data) + sbdtags = torch.LongTensor(sbdlabels).to(self.device) + sbdlogits = sbdlogits.permute(0,2,1) + sbdloss = self.sbdloss(sbdlogits,sbdtags) lengths = [self.mtlmodel.sequence_length] * self.mtlmodel.batch_size lengths = torch.LongTensor(lengths).to(self.device) + scores = (poslogits, lengths, self.mtlmodel.poscrf.transitions) - #scores = (poslogits,lengths,self.mtlmodel.poscrf.transitions) - #sbdloss = self.sbdloss - - sbdlogits = sbdlogits.permute(0,2,1) - sbdloss = self.sbdloss(sbdlogits,sbdtags) - #posloss = self.postagloss(scores,postags) + # unwrap the pos tags into one long list first + postags = [p for pos in poslabels for p in pos] + postags = torch.LongTensor(postags).to(self.device) + posloss = self.postagloss(scores,postags) - #mtlloss = posloss + sbdloss # uniform weighting. # TODO: learnable weights? - #mtlloss.backward() - sbdloss.backward() + mtlloss = posloss + sbdloss # uniform weighting. # TODO: learnable weights? + mtlloss.backward() + #sbdloss.backward() self.optimizer.step() #self.scheduler.step() - #self.writer.add_scalar('train_pos_loss', posloss.item(), epoch) + self.writer.add_scalar('train_pos_loss', posloss.item(), epoch) self.writer.add_scalar('train_sbd_loss', sbdloss.item(), epoch) - #self.writer.add_scalar('train_joint_loss', mtlloss.item(), epoch) + self.writer.add_scalar('train_joint_loss', mtlloss.item(), epoch) if epoch % self.evalstep == 0: @@ -507,72 +550,109 @@ def read_file(mode='train'): #start = time() with torch.no_grad(): - totaldevloss = 0 - allpreds = [] - allgold = [] + totalsbddevloss = 0 + totalposdevloss = 0 + + allsbdpreds = [] + allsbdgold = [] + allpospreds = [] + allposgold = [] + invalidlabelscount = 0 for slice in devdata: - preds = [] + sentence = ' '.join([s.split('\t')[0].strip() for s in slice]) + sentence = Sentence(sentence,use_tokenizer=False) - goldlabels = [s.split('\t')[2].strip() for s in slice] - goldlabels = [self.mtlmodel.sbd_tag2idx[s] for s in goldlabels] + goldsbdlabels = [s.split('\t')[2].strip() for s in slice] + goldsbdlabels = [self.mtlmodel.sbd_tag2idx[s] for s in goldsbdlabels] + goldposlabels = [s.split('\t')[1].strip() for s in slice] + goldposlabels = [self.mtlmodel.postagsetcrf.get_idx_for_item(s) for s in goldposlabels] - sbdlogits, sbdlabels, finalmapping = self.mtlmodel(slice,mode='dev') + sbdlogits, sbdlabels, sbdpreds, poslogits, poslabels = self.mtlmodel(slice,mode='dev') if sbdlabels is not None: - # get the predictions - on non-shingled data - for idx in finalmapping: - snum, position = finalmapping[idx] - label = torch.argmax(sbdlogits[snum][position]).item() - preds.append(label) + # get the pos predictions + lengths = [self.mtlmodel.sequence_length] + lengths = torch.LongTensor(lengths).to(self.device) + scores = (poslogits, lengths, self.mtlmodel.poscrf.transitions) + pospreds = self.mtlmodel.viterbidecoder.decode(scores,False,[sentence]) + pospreds = [self.mtlmodel.postagsetcrf.get_idx_for_item(p[0]) for pr in pospreds[0] for p in pr] - # get the loss - on 'shingled' data + # get the sbd loss sbdlogits = sbdlogits.permute(0,2,1) 
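The ViterbiLoss and ViterbiDecoder used here come from lib.crfutils and take the (logits, lengths, transitions) tuple built above; the sketch below is not that API, only a generic illustration of what a linear-chain CRF sequence loss computes from emission scores and a tag-to-tag transition matrix (start/stop transitions omitted for brevity):

import torch

def crf_nll(emissions, transitions, tags):
    # emissions: [seq_len, num_tags]; transitions[i, j]: score of moving from tag i to tag j;
    # tags: [seq_len] gold tag ids. Returns the negative log-likelihood of the gold path.
    seq_len, num_tags = emissions.shape
    # score of the gold path: emission plus transition scores along the sequence
    gold = emissions[0, tags[0]]
    for t in range(1, seq_len):
        gold = gold + transitions[tags[t - 1], tags[t]] + emissions[t, tags[t]]
    # log partition function via the forward algorithm in log space
    alpha = emissions[0]                                                   # [num_tags]
    for t in range(1, seq_len):
        alpha = torch.logsumexp(alpha.unsqueeze(1) + transitions, dim=0) + emissions[t]
    return torch.logsumexp(alpha, dim=0) - gold

emissions = torch.randn(5, 3)                 # 5 tokens, 3 tags
transitions = torch.randn(3, 3)
tags = torch.tensor([0, 2, 1, 1, 0])
print(crf_nll(emissions, transitions, tags))  # always >= 0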
sbdtags = torch.LongTensor(sbdlabels).to(self.device) - devloss = self.sbdloss(sbdlogits, sbdtags).item() + sbddevloss = self.sbdloss(sbdlogits, sbdtags).item() + + # get the pos loss + postags = torch.LongTensor(poslabels) + postags = postags.to(self.device) + posdevloss = self.postagloss(scores,postags).item() else: - preds = [self.mtlmodel.sbd_tag2idx["O"] for _ in goldlabels] - invalidlabelscount += len(goldlabels) - devloss = 0 + sbdpreds = [self.mtlmodel.sbd_tag2idx["O"] for _ in goldsbdlabels] + pospreds = [self.mtlmodel.postagset['X'] for _ in goldsbdlabels] + invalidlabelscount += len(goldsbdlabels) + sbddevloss = 0 + posdevloss = 0 + - totaldevloss += devloss - allpreds.extend(preds) - allgold.extend(goldlabels) + totalsbddevloss += sbddevloss + totalposdevloss += posdevloss - #print ('dev inference') - #print (time() - start) + allsbdpreds.extend(sbdpreds) + allsbdgold.extend(goldsbdlabels) + allpospreds.extend(pospreds) + allposgold.extend(goldposlabels) goldspans = [] predspans = [] goldstartindex = 0 predstartindex = 0 - for i in range(0,len(allgold)): - if allgold[i] == 1: #B-SENT + + for i in range(0,len(allsbdgold)): + if allsbdgold[i] == 1: #B-SENT goldspans.append(UDSpan(goldstartindex,i)) goldstartindex = i - if allpreds[i] == 1: + if allsbdpreds[i] == 1: predspans.append(UDSpan(predstartindex,i)) predstartindex = i + sbdscores = spans_score(goldspans,predspans) - - scores = spans_score(goldspans,predspans) + correctpos = sum([1 if p == g else 0 for p,g in zip(allpospreds,allposgold)]) + posscores = Score(len(allposgold),len(allpospreds),correctpos,len(allpospreds)) print ('invalid labels:' + str(invalidlabelscount)) + print('\n') - self.writer.add_scalar("dev_loss",round(totaldevloss/len(devdata),2),int(epoch / self.evalstep)) - self.writer.add_scalar("dev_f1", round(scores.f1,2), int(epoch / self.evalstep)) - self.writer.add_scalar("dev_precision", round(scores.precision, 2), int(epoch / self.evalstep)) - self.writer.add_scalar("dev_recall", round(scores.recall, 2), int(epoch / self.evalstep)) + self.writer.add_scalar("mtl_dev_loss", round((totalsbddevloss / len(devdata) + (totalposdevloss / len(devdata))), 2), + int(epoch / self.evalstep)) + print('mtl dev loss:' + str(round((totalsbddevloss / len(devdata) + (totalposdevloss / len(devdata))), 2))) + self.writer.add_scalar("sbd_dev_loss",round(totalsbddevloss/len(devdata),2),int(epoch / self.evalstep)) + self.writer.add_scalar("sbd_dev_f1", round(sbdscores.f1,2), int(epoch / self.evalstep)) + self.writer.add_scalar("sbd_dev_precision", round(sbdscores.precision, 2), int(epoch / self.evalstep)) + self.writer.add_scalar("sbd_dev_recall", round(sbdscores.recall, 2), int(epoch / self.evalstep)) - print ('dev f1:' + str(scores.f1)) - print('dev precision:' + str(scores.precision)) - print('dev recall:' + str(scores.recall)) print ('\n') + self.writer.add_scalar("pos_dev_loss", round(totalposdevloss / len(devdata), 2), + int(epoch / self.evalstep)) + self.writer.add_scalar("pos_dev_f1", round(posscores.f1, 2), int(epoch / self.evalstep)) + self.writer.add_scalar("pos_dev_precision", round(posscores.precision, 2), + int(epoch / self.evalstep)) + self.writer.add_scalar("pos_dev_recall", round(posscores.recall, 2), int(epoch / self.evalstep)) + + print ('sbd dev f1:' + str(sbdscores.f1)) + print('sbd dev precision:' + str(sbdscores.precision)) + print('sbd dev recall:' + str(sbdscores.recall)) + print ('\n') + + print('pos dev f1:' + str(posscores.f1)) + print('pos dev precision:' + str(posscores.precision)) + print('pos dev 
recall:' + str(posscores.recall)) + def predict(self): From 2b2d80746848d400ec30b51a2dad7245e0cd980c Mon Sep 17 00:00:00 2001 From: nitin Date: Wed, 27 Jul 2022 02:57:05 +0800 Subject: [PATCH 11/32] bugfix --- .../lib/multitask_sentsplitter_postagger.py | 128 +++++++++--------- 1 file changed, 63 insertions(+), 65 deletions(-) diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py index eb4c03f..839b5c0 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -13,13 +13,11 @@ from lib.crfutils.crf import CRF from lib.crfutils.viterbi import ViterbiDecoder,ViterbiLoss - from time import time os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256" SAMPLE_SIZE = 16 - def spans_score(gold_spans, system_spans): correct, gi, si = 0, 0, 0 while gi < len(gold_spans) and si < len(system_spans): @@ -271,24 +269,25 @@ def shingle(self,toks,labels=None): def forward(self,data,mode='train'): - badrecords = [] # stores records where AlephBERT's tokenization messed up the sentence's sequence length, and removes these sentences from the batch. + badrecords = [] # stores records where AlephBERT's tokenization 'messed up' the sentence's sequence length, and removes these sentences from the batch. + # Extract the sentences and labels if mode == 'train': # training is on a batch, so 3D tensor sentences = [' '.join([s.split('\t')[0].strip() for s in sls]) for sls in data] sbdlabels = [[self.sbd_tag2idx[s.split('\t')[2].strip()] for s in sls] for sls in data] poslabels = [[self.postagsetcrf.get_idx_for_item(s.split('\t')[1].strip()) for s in sls] for sls in data] elif mode == 'dev': # inference is on a single record sentences = [' '.join([s.split('\t')[0].strip() for s in data])] - sbdlabels = [s.split('\t')[2].strip() for s in data] + sbdlabels = [self.sbd_tag2idx[s.split('\t')[2].strip()] for s in data] poslabels = [self.postagsetcrf.get_idx_for_item(s.split('\t')[1].strip()) for s in data] else: # test - has no labels, and 2D tensor single record sentences = [' '.join([s.split('\t')[0].strip() for s in data])] sbdlabels = None poslabels = None + # Make embeddings sentences = [d.split() for d in sentences] # for AlephBERT - tokens = self.tokenizer(sentences,return_tensors='pt',padding=True,is_split_into_words=True).to(self.device) # tell AlephBERT that there is some tokenization already. Otherwise its own subword tokenization messes things up. - + tokens = self.tokenizer(sentences,return_tensors='pt',padding=True,is_split_into_words=True).to(self.device) # tell AlephBERT that there is some tokenization already. embeddings = self.model(**tokens) embeddings = embeddings[0] @@ -296,7 +295,7 @@ def forward(self,data,mode='train'): Average the subword embeddings This process will drop the [CLS],[SEP] and [PAD] tokens """ - #start = time() + avgembeddings = [] for k in range(0,len(tokens.encodings)): emb = [] @@ -305,7 +304,7 @@ def forward(self,data,mode='train'): try: assert maxindex == self.sequence_length - 1 # otherwise won't average correctly and align with labels except AssertionError: - print ('max index not equal sequence len. Skipping.') + print ('max index not equal sequence len. Default labels will be applied.') badrecords.append(k) continue @@ -323,7 +322,7 @@ def forward(self,data,mode='train'): try: assert len(emb) == self.sequence_length # averaging was correct and aligns with the labels except AssertionError: - print ('embedding not built correctly. 
Skipping') + print ('embedding not built correctly. Default labels will be applied') badrecords.append(k) continue @@ -332,63 +331,65 @@ def forward(self,data,mode='train'): badrecords = sorted(badrecords,reverse=True) - if len(avgembeddings) > 0: - avgembeddings = torch.stack(avgembeddings) - for record in badrecords: - sbdlabels.pop(record) - poslabels.pop(record) - self.batch_size -= 1 - else: - return None,None,None,None,None - - #print ('average embeddings') - #print (time() - start) + avgembeddings = torch.stack(avgembeddings) + for record in badrecords: + sbdlabels.pop(record) + poslabels.pop(record) + self.batch_size -= 1 if mode != 'train': # squeeze the embedding, as it's a single sentence avgembeddings = torch.squeeze(avgembeddings) - finalembeddings,finalsbdlabels,finalmapping = self.shingle(avgembeddings,sbdlabels) + # shingle the sentence embedding and its label, to calculate the dev loss later + sbdembeddings,finalsbdlabels,finalmapping = self.shingle(avgembeddings,sbdlabels) + # restore dimensionality for the POS tagging pipeline. avgembeddings = torch.unsqueeze(avgembeddings,dim=0) - if mode != 'test': - finalsbdlabels = [[self.sbd_tag2idx[s] for s in sls] for sls in finalsbdlabels] + else: - finalembeddings = avgembeddings + sbdembeddings = avgembeddings finalsbdlabels = sbdlabels finalmapping = None - finalembeddings = self.embeddingdropout(finalembeddings) + sbdembeddings = self.embeddingdropout(sbdembeddings) # SBD encoder and labels if self.sbdencodertype in ('lstm','gru'): - feats, _ = self.sbdencoder(finalembeddings) + feats, _ = self.sbdencoder(sbdembeddings) else: - feats = self.sbdposencoder(finalembeddings) + feats = self.sbdposencoder(sbdembeddings) feats = self.sbdencoder(feats) - # Intermediate Feedforward layer + # SBD Intermediate Feedforward layer feats = self.sbdfflayer(feats) feats = self.relu(feats) feats = self.dropout(feats) - # logits for sbd + # SBD logits sbdlogits = self.hidden2sbd(feats) #get the sbd predictions as input to the POS encoder if mode == 'train': sbdpreds = torch.argmax(sbdlogits,dim=2,keepdim=True) else: + # Predict from the shingles for SBD. + # 'Believe the span where the token is most in the middle' sbdpreds = [] for idx in finalmapping: snum, position = finalmapping[idx] label = torch.argmax(sbdlogits[snum][position]).item() sbdpreds.append(label) + + # Unsqueeze for input to the POS Encoder sbdpreds = torch.LongTensor(sbdpreds) sbdpreds = torch.unsqueeze(sbdpreds,dim=0) sbdpreds = torch.unsqueeze(sbdpreds, dim=2) sbdpreds = sbdpreds.to(self.device) + # Add the SBD predictions to the POS Encoder Input! 
posembeddings = torch.cat((avgembeddings,sbdpreds),dim=2) + if mode in ('dev','test'): + # Squeeze these to return to the Trainer for scores, now that we are done with them sbdpreds = torch.squeeze(sbdpreds,dim=2) sbdpreds = torch.squeeze(sbdpreds, dim=0) sbdpreds = sbdpreds.tolist() @@ -405,14 +406,15 @@ def forward(self,data,mode='train'): poslogits = self.hidden2postag(feats) poslogits = self.poscrf(poslogits) + # Some memory management del embeddings - del finalembeddings + del sbdembeddings del avgembeddings + del posembeddings del feats - torch.cuda.empty_cache() - return sbdlogits,finalsbdlabels,sbdpreds, poslogits,poslabels # returns the logits and labels + return sbdlogits, finalsbdlabels, sbdpreds, poslogits, poslabels # returns the logits and labels class Tagger(): def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=512,sbdrnnnumlayers=2,sbdrnnbidirectional=True,sbdrnndropout=0.3,sbdencodertype='lstm',sbdffdim=512,learningrate = 0.001): @@ -512,12 +514,12 @@ def read_file(mode='train'): for epoch in range(1,epochs): + old_batchsize = self.mtlmodel.batch_size self.mtlmodel.train() self.optimizer.zero_grad() - data = sample(trainingdata,SAMPLE_SIZE) + data = sample(trainingdata,self.mtlmodel.batch_size) data = [datum for datum in data if len(datum) == self.mtlmodel.sequence_length] - self.mtlmodel.batch_size = len(data) sbdlogits, sbdlabels, _, poslogits,poslabels = self.mtlmodel(data) @@ -536,18 +538,21 @@ def read_file(mode='train'): mtlloss = posloss + sbdloss # uniform weighting. # TODO: learnable weights? mtlloss.backward() - #sbdloss.backward() self.optimizer.step() #self.scheduler.step() + if old_batchsize != self.mtlmodel.batch_size: + self.mtlmodel.batch_size = old_batchsize + self.writer.add_scalar('train_pos_loss', posloss.item(), epoch) self.writer.add_scalar('train_sbd_loss', sbdloss.item(), epoch) self.writer.add_scalar('train_joint_loss', mtlloss.item(), epoch) if epoch % self.evalstep == 0: + self.mtlmodel.eval() - #start = time() + with torch.no_grad(): totalsbddevloss = 0 @@ -558,10 +563,12 @@ def read_file(mode='train'): allpospreds = [] allposgold = [] - invalidlabelscount = 0 - for slice in devdata: + old_seqlen = self.mtlmodel.sequence_length + if len(slice) != self.mtlmodel.sequence_length: # this will happen in one case, for the last slice in the dev batch + self.mtlmodel.sequence_length = len(slice) + sentence = ' '.join([s.split('\t')[0].strip() for s in slice]) sentence = Sentence(sentence,use_tokenizer=False) @@ -572,32 +579,22 @@ def read_file(mode='train'): sbdlogits, sbdlabels, sbdpreds, poslogits, poslabels = self.mtlmodel(slice,mode='dev') - if sbdlabels is not None: + # get the pos predictions + lengths = [self.mtlmodel.sequence_length] + lengths = torch.LongTensor(lengths).to(self.device) + scores = (poslogits, lengths, self.mtlmodel.poscrf.transitions) + pospreds = self.mtlmodel.viterbidecoder.decode(scores,False,[sentence]) + pospreds = [self.mtlmodel.postagsetcrf.get_idx_for_item(p[0]) for pr in pospreds[0] for p in pr] - # get the pos predictions - lengths = [self.mtlmodel.sequence_length] - lengths = torch.LongTensor(lengths).to(self.device) - scores = (poslogits, lengths, self.mtlmodel.poscrf.transitions) - pospreds = self.mtlmodel.viterbidecoder.decode(scores,False,[sentence]) - pospreds = [self.mtlmodel.postagsetcrf.get_idx_for_item(p[0]) for pr in pospreds[0] for p in pr] - - # get the sbd loss - sbdlogits = sbdlogits.permute(0,2,1) - sbdtags = torch.LongTensor(sbdlabels).to(self.device) - sbddevloss = 
self.sbdloss(sbdlogits, sbdtags).item() - - # get the pos loss - postags = torch.LongTensor(poslabels) - postags = postags.to(self.device) - posdevloss = self.postagloss(scores,postags).item() - - else: - sbdpreds = [self.mtlmodel.sbd_tag2idx["O"] for _ in goldsbdlabels] - pospreds = [self.mtlmodel.postagset['X'] for _ in goldsbdlabels] - invalidlabelscount += len(goldsbdlabels) - sbddevloss = 0 - posdevloss = 0 + # get the sbd loss + sbdlogits = sbdlogits.permute(0,2,1) + sbdtags = torch.LongTensor(sbdlabels).to(self.device) + sbddevloss = self.sbdloss(sbdlogits, sbdtags).item() + # get the pos loss + postags = torch.LongTensor(poslabels) + postags = postags.to(self.device) + posdevloss = self.postagloss(scores,postags).item() totalsbddevloss += sbddevloss totalposdevloss += posdevloss @@ -607,6 +604,9 @@ def read_file(mode='train'): allpospreds.extend(pospreds) allposgold.extend(goldposlabels) + if self.mtlmodel.sequence_length != old_seqlen: + self.mtlmodel.sequence_length = old_seqlen + goldspans = [] predspans = [] goldstartindex = 0 @@ -624,9 +624,6 @@ def read_file(mode='train'): correctpos = sum([1 if p == g else 0 for p,g in zip(allpospreds,allposgold)]) posscores = Score(len(allposgold),len(allpospreds),correctpos,len(allpospreds)) - print ('invalid labels:' + str(invalidlabelscount)) - print('\n') - self.writer.add_scalar("mtl_dev_loss", round((totalsbddevloss / len(devdata) + (totalposdevloss / len(devdata))), 2), int(epoch / self.evalstep)) print('mtl dev loss:' + str(round((totalsbddevloss / len(devdata) + (totalposdevloss / len(devdata))), 2))) @@ -652,6 +649,7 @@ def read_file(mode='train'): print('pos dev f1:' + str(posscores.f1)) print('pos dev precision:' + str(posscores.precision)) print('pos dev recall:' + str(posscores.recall)) + print('\n') From 835733ae4c5a6aa13a14d5504a5bd57257842f09 Mon Sep 17 00:00:00 2001 From: nitin Date: Wed, 27 Jul 2022 10:24:49 +0800 Subject: [PATCH 12/32] mtl model v1 --- hebpipe/lib/multitask_sentsplitter_postagger.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py index 839b5c0..ea45031 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -16,7 +16,6 @@ from time import time os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256" -SAMPLE_SIZE = 16 def spans_score(gold_spans, system_spans): correct, gi, si = 0, 0, 0 @@ -72,7 +71,7 @@ def forward(self, x): class MTLModel(nn.Module): - def __init__(self,sbdrnndim=512,posrnndim=512,sbdrnnnumlayers=2,posrnnnumlayers=2,sbdrnnbidirectional=True,posrnnbidirectional=True,sbdrnndropout=0.3,posrnndropout=0.3,sbdencodertype='lstm',posencodertype='lstm',sbdffdim=512,posffdim=512,batchsize=SAMPLE_SIZE,sbdtransformernumlayers=6,sbdnhead=8,sequencelength=128): + def __init__(self,sbdrnndim=512,posrnndim=512,sbdrnnnumlayers=2,posrnnnumlayers=2,sbdrnnbidirectional=True,posrnnbidirectional=True,sbdrnndropout=0.3,posrnndropout=0.3,sbdencodertype='lstm',posencodertype='lstm',sbdffdim=512,posffdim=512,batchsize=16,sbdtransformernumlayers=4,sbdnhead=4,sequencelength=128): super(MTLModel,self).__init__() self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -93,6 +92,7 @@ def __init__(self,sbdrnndim=512,posrnndim=512,sbdrnnnumlayers=2,posrnnnumlayers= self.batch_size = batchsize # Embedding parameters and model + # Embeddings on the cpu. 
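Dev evaluation turns the B-SENT positions into sentence spans and scores them at the span level, as in spans_score above. A simplified exact-match version of that conversion and scoring (the patch walks two ordered span lists instead of using sets, but the idea is the same; label 1 stands for B-SENT):

def labels_to_spans(labels):
    # turn a 0/1 B-SENT sequence into (start, end) sentence spans
    spans, start = [], 0
    for i, lab in enumerate(labels):
        if lab == 1 and i > 0:            # a new sentence starts here; close the previous one
            spans.append((start, i))
            start = i
    spans.append((start, len(labels)))    # close the final sentence
    return spans

def span_f1(gold, pred):
    gold_spans, pred_spans = set(labels_to_spans(gold)), set(labels_to_spans(pred))
    correct = len(gold_spans & pred_spans)
    p, r = correct / len(pred_spans), correct / len(gold_spans)
    return 2 * p * r / (p + r) if p + r else 0.0

gold = [1, 0, 0, 1, 0, 0, 0, 1, 0]
pred = [1, 0, 0, 1, 0, 1, 0, 1, 0]
print(span_f1(gold, pred))                # ~0.571: two of four predicted sentences match exactly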
self.tokenizer = BertTokenizerFast.from_pretrained('onlplab/alephbert-base') self.model = BertModel.from_pretrained('onlplab/alephbert-base').to(self.device) self.embeddingdim = 768 @@ -290,6 +290,7 @@ def forward(self,data,mode='train'): tokens = self.tokenizer(sentences,return_tensors='pt',padding=True,is_split_into_words=True).to(self.device) # tell AlephBERT that there is some tokenization already. embeddings = self.model(**tokens) embeddings = embeddings[0] + #embeddings = embeddings.to(self.device) """ Average the subword embeddings @@ -459,7 +460,7 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdr list(self.mtlmodel.hidden2sbd.parameters()) + list(self.mtlmodel.posencoder.parameters()) + list(self.mtlmodel.posfflayer.parameters()) + list(self.mtlmodel.hidden2postag.parameters()) + list(self.mtlmodel.poscrf.parameters()), lr=learningrate) - #self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer,milestones=[150,400],gamma=0.1) + self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer,milestones=[400,1000],gamma=0.1) self.evalstep = 20 self.set_seed(42) @@ -520,6 +521,7 @@ def read_file(mode='train'): data = sample(trainingdata,self.mtlmodel.batch_size) data = [datum for datum in data if len(datum) == self.mtlmodel.sequence_length] + self.mtlmodel.batch_size = len(data) sbdlogits, sbdlabels, _, poslogits,poslabels = self.mtlmodel(data) @@ -536,10 +538,11 @@ def read_file(mode='train'): postags = torch.LongTensor(postags).to(self.device) posloss = self.postagloss(scores,postags) + mtlloss = posloss + sbdloss # uniform weighting. # TODO: learnable weights? mtlloss.backward() self.optimizer.step() - #self.scheduler.step() + self.scheduler.step() if old_batchsize != self.mtlmodel.batch_size: self.mtlmodel.batch_size = old_batchsize @@ -550,7 +553,6 @@ def read_file(mode='train'): if epoch % self.evalstep == 0: - self.mtlmodel.eval() with torch.no_grad(): @@ -563,6 +565,7 @@ def read_file(mode='train'): allpospreds = [] allposgold = [] + start = time() for slice in devdata: old_seqlen = self.mtlmodel.sequence_length @@ -604,6 +607,8 @@ def read_file(mode='train'): allpospreds.extend(pospreds) allposgold.extend(goldposlabels) + print ('inference time') + print (time() - start) if self.mtlmodel.sequence_length != old_seqlen: self.mtlmodel.sequence_length = old_seqlen From 920e08814621a385b73c5c408ca94799a00d2ff9 Mon Sep 17 00:00:00 2001 From: nitin Date: Fri, 29 Jul 2022 04:58:53 +0800 Subject: [PATCH 13/32] v1 pipeline --- hebpipe/heb_pipe.py | 336 ++++++------ .../lib/multitask_sentsplitter_postagger.py | 495 +++++++++++++++--- 2 files changed, 608 insertions(+), 223 deletions(-) diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index 57fd2b3..afee763 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -12,6 +12,8 @@ from stanza.models.common.doc import Document import torch +from time import time + from rftokenizer import RFTokenizer try: # Module usage from .lib.xrenner import Xrenner @@ -22,6 +24,7 @@ from .lib.whitespace_tokenize import add_space_after, tokenize as whitespace_tokenize from .lib.flair_sent_splitter import FlairSentSplitter from .lib.flair_pos_tagger import FlairTagger + from .lib.multitask_sentsplitter_postagger import Tagger except ImportError: # direct script usage from lib.xrenner import Xrenner from lib._version import __version__ @@ -31,6 +34,7 @@ from lib.whitespace_tokenize import add_space_after, tokenize as whitespace_tokenize from lib.flair_sent_splitter import FlairSentSplitter from 
lib.flair_pos_tagger import FlairTagger + from lib.multitask_sentsplitter_postagger import Tagger PY3 = sys.version_info[0] > 2 @@ -588,177 +592,202 @@ def download_requirements(models_ok=True): def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, do_parse=True, do_entity=True, out_mode="conllu", sent_tag=None, preloaded=None, punct_sentencer=False, from_pipes=False, filecount=1): - data = input_data.replace("\t","") - data = data.replace("\r","") + data = input_data.replace("\t","") + data = data.replace("\r","") - if from_pipes: - input_data = input_data.replace("|","") + if from_pipes: + input_data = input_data.replace("|","") - if preloaded is not None: - rf_tok, xrenner, flair_sent_splitter, parser, tagger, morpher, lemmatizer = preloaded - else: - rf_tok = RFTokenizer(model=model_dir + "heb.sm" + str(sys.version_info[0])) - xrenner = Xrenner(model=model_dir + "heb.xrm") - if sent_tag == "auto" and not punct_sentencer: - flair_sent_splitter = FlairSentSplitter(model_path=model_dir + "heb.sent") - else: - flair_sent_splitter = None - parser = None if not do_parse else Parser.load(model_dir + "heb.diaparser",verbose=False) - tagger = None if not do_tag else FlairTagger() - morpher = None if not do_tag else FlairTagger(morph=True) - lemmatizer = None if not do_lemma and not do_tag else init_lemmatizer() + #start = time.time() + mtltagger = Tagger(trainflag=False) + #print ('mtl init') + #print (time.time() - start) - if do_whitespace: - data = whitespace_tokenize(data, abbr=data_dir + "heb_abbr.tab",add_sents=sent_tag=="auto", from_pipes=from_pipes) - if from_pipes: - tokenized = data - else: - if do_tok: - tokenized = rf_tok.rf_tokenize(data.strip().split("\n")) - tokenized = "\n".join(tokenized) + if preloaded is not None: + rf_tok, xrenner, flair_sent_splitter, parser, tagger, morpher, lemmatizer = preloaded else: - # Assume data is already one token per line - tokenized = data - bound_group_map = get_bound_group_map(tokenized) if out_mode == "conllu" else None - if sent_tag == "auto": - sent_tag = "s" - if punct_sentencer: - tokenized = toks_to_sents(tokenized) + + rf_tok = RFTokenizer(model=model_dir + "heb.sm" + str(sys.version_info[0])) + xrenner = Xrenner(model=model_dir + "heb.xrm") + + """ + if sent_tag == "auto" and not punct_sentencer: + flair_sent_splitter = FlairSentSplitter(model_path=model_dir + "heb.sent") + else: + flair_sent_splitter = None + """ + parser = None if not do_parse else Parser.load(model_dir + "heb.diaparser",verbose=False) + #tagger = None if not do_tag else FlairTagger() + morpher = None if not do_tag else FlairTagger(morph=True) + lemmatizer = None if not do_lemma and not do_tag else init_lemmatizer() + + if do_whitespace: + data = whitespace_tokenize(data, abbr=data_dir + "heb_abbr.tab",add_sents=sent_tag=="auto", from_pipes=from_pipes) + + if from_pipes: + tokenized = data else: - tokenized = flair_sent_splitter.split(tokenized) - if filecount == 1: - # Free up GPU memory if no more files need it - del flair_sent_splitter - torch.cuda.empty_cache() + if do_tok: + tokenized = rf_tok.rf_tokenize(data.strip().split("\n")) + tokenized = "\n".join(tokenized) + else: + # Assume data is already one token per line + tokenized = data - if out_mode == "pipes": - return tokenized - else: - tokenized = tokenized.split("\n") - retokenized = [] - for line in tokenized: - if line == "|": - retokenized.append(line) + bound_group_map = get_bound_group_map(tokenized) if out_mode == "conllu" else None + + """ + if sent_tag == "auto": + 
sent_tag = "s" + if punct_sentencer: + tokenized = toks_to_sents(tokenized) else: - retokenized.append("\n".join(line.split("|"))) - tokenized = "\n".join(retokenized) - - if do_tag: - # Flair - to_tag = conllize(tokenized,element="s",super_mapping=bound_group_map,attrs_as_comments=True) - tagged_conllu = tagger.predict(to_tag, in_format="conllu", as_text=True) - # Uncomment to test lemmatizer with gold POS tags - #tagged_conllu = io.open("he_htb-ud-test.conllu",encoding="utf8").read() - #opts = type('', (), {"quiet":False, "kill":"both"})() - #d = DepEdit(config_file=[],options=opts) - #tagged_conllu = d.run_depedit(tagged_conllu) + tokenized = flair_sent_splitter.split(tokenized) + if filecount == 1: + # Free up GPU memory if no more files need it + del flair_sent_splitter + torch.cuda.empty_cache() + + if out_mode == "pipes": + return tokenized + else: + tokenized = tokenized.split("\n") + retokenized = [] + for line in tokenized: + if line == "|": + retokenized.append(line) + else: + retokenized.append("\n".join(line.split("|"))) + tokenized = "\n".join(retokenized) + + if do_tag: + # Flair + to_tag = conllize(tokenized,element="s",super_mapping=bound_group_map,attrs_as_comments=True) + tagged_conllu = tagger.predict(to_tag, in_format="conllu", as_text=True) + # Uncomment to test lemmatizer with gold POS tags + #tagged_conllu = io.open("he_htb-ud-test.conllu",encoding="utf8").read() + #opts = type('', (), {"quiet":False, "kill":"both"})() + #d = DepEdit(config_file=[],options=opts) + #tagged_conllu = d.run_depedit(tagged_conllu) + pos_tags = [l.split("\t")[3] for l in tagged_conllu.split("\n") if "\t" in l] + """ + + sent_tag = 's' + start = time() + tagged_conllu, tokenized = mtltagger.split_pos(tokenized,checkpointfile='/home/nitin/Desktop/hebpipe/HebPipe/hebpipe/data/checkpoint/best_sent_pos_model_12.603806_0.866864_0.971045.pt') pos_tags = [l.split("\t")[3] for l in tagged_conllu.split("\n") if "\t" in l] - #morpher = None - if morpher is None: - # Marmot - if platform.system() == "Windows": - tag = ["java","-Dfile.encoding=UTF-8","-Xmx2g","-cp","marmot.jar;trove.jar","marmot.morph.cmd.Annotator","-model-file","heb.marmot","-lemmatizer-file","heb.lemming","-test-file","form-index=0,tempfilename","-pred-file","tempfilename2"] + del mtltagger + del rf_tok + torch.cuda.empty_cache() + + if do_tag: + + #morpher = None + if morpher is None: + # Marmot + if platform.system() == "Windows": + tag = ["java","-Dfile.encoding=UTF-8","-Xmx2g","-cp","marmot.jar;trove.jar","marmot.morph.cmd.Annotator","-model-file","heb.marmot","-lemmatizer-file","heb.lemming","-test-file","form-index=0,tempfilename","-pred-file","tempfilename2"] + else: + tag = ["java","-Dfile.encoding=UTF-8","-Xmx2g","-cp","marmot.jar:trove.jar","marmot.morph.cmd.Annotator","-model-file","heb.marmot","-lemmatizer-file","heb.lemming","-test-file","form-index=0,tempfilename","-pred-file","tempfilename2"] + no_sent = re.sub(r']+)?>\n?','',tokenized).strip() + morphed = exec_via_temp(no_sent, tag, workdir=marmot_path, outfile=True) + morphed = morphed.strip().split("\n") + # Clean up tags for OOV glyphs + cleaned = [] + toknum = 0 + for line in morphed: + if "\t" in line: + fields = line.split("\t") + fields[5] = pos_tags[toknum] # Insert flair tags + if fields[1] in KNOWN_PUNCT: # Hard fix unicode punctuation + fields[5] = "PUNCT" + fields[7] = "_" + line = "\t".join(fields) + toknum += 1 + cleaned.append(line) + # morphed = cleaned + morphs = get_col(morphed, 7) + words = get_col(morphed, 1) + lemmas = get_col(morphed, 3) + 
tagged = inject_col(morphed, tokenized, 5) else: - tag = ["java","-Dfile.encoding=UTF-8","-Xmx2g","-cp","marmot.jar:trove.jar","marmot.morph.cmd.Annotator","-model-file","heb.marmot","-lemmatizer-file","heb.lemming","-test-file","form-index=0,tempfilename","-pred-file","tempfilename2"] - no_sent = re.sub(r']+)?>\n?','',tokenized).strip() - morphed = exec_via_temp(no_sent, tag, workdir=marmot_path, outfile=True) - morphed = morphed.strip().split("\n") - # Clean up tags for OOV glyphs - cleaned = [] - toknum = 0 - for line in morphed: - if "\t" in line: - fields = line.split("\t") - fields[5] = pos_tags[toknum] # Insert flair tags - if fields[1] in KNOWN_PUNCT: # Hard fix unicode punctuation - fields[5] = "PUNCT" - fields[7] = "_" - line = "\t".join(fields) - toknum += 1 - cleaned.append(line) - # morphed = cleaned - morphs = get_col(morphed, 7) - words = get_col(morphed, 1) - lemmas = get_col(morphed, 3) - tagged = inject_col(morphed, tokenized, 5) - else: - # flair - morphed = morpher.predict(tagged_conllu, in_format="conllu", as_text=True, tags=True) - morphs = get_col(morphed, 4) - words = get_col(morphed, 1) - # Uncomment to test with gold morphology from tagged_conllu - #morphs = get_col(tagged_conllu, 5) - #morphed = inject_col(morphs, tagged_conllu, into_col=5, skip_supertoks=True) - zeros = ["0" for i in range(len(morphs))] - zero_conllu = inject_col(zeros,tagged_conllu,into_col=6, skip_supertoks=True) - lemmas = lemmatize(lemmatizer,zero_conllu,morphs) - tagged = inject_col(tagged_conllu,tokenized,4) - - if do_lemma: - lemmatized = inject_col(lemmas,tagged,-1) - else: - lemmatized = tagged + # flair + morphed = morpher.predict(tagged_conllu, in_format="conllu", as_text=True, tags=True) + morphs = get_col(morphed, 4) + words = get_col(morphed, 1) + # Uncomment to test with gold morphology from tagged_conllu + #morphs = get_col(tagged_conllu, 5) + #morphed = inject_col(morphs, tagged_conllu, into_col=5, skip_supertoks=True) + zeros = ["0" for i in range(len(morphs))] + zero_conllu = inject_col(zeros,tagged_conllu,into_col=6, skip_supertoks=True) + lemmas = lemmatize(lemmatizer,zero_conllu,morphs) + tagged = inject_col(tagged_conllu,tokenized,4) + + if do_lemma: + lemmatized = inject_col(lemmas,tagged,-1) + else: + lemmatized = tagged - morphs = postprocess_morph(morphs, words, lemmas) - morphed = inject_col(morphs,lemmatized,-1) + morphs = postprocess_morph(morphs, words, lemmas) + morphed = inject_col(morphs,lemmatized,-1) - if not do_parse: + if not do_parse: + if out_mode == "conllu": + conllized = conllize(morphed, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, + attrs_as_comments=True) + conllized = add_space_after(input_data,conllized) + return conllized + else: + if not PY3: + morphed = morphed.decode("utf8") + return morphed + + else: if out_mode == "conllu": - conllized = conllize(morphed, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, + conllized = conllize(tokenized, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, attrs_as_comments=True) - conllized = add_space_after(input_data,conllized) + conllized = add_space_after(input_data, conllized) return conllized else: - if not PY3: - morphed = morphed.decode("utf8") - return morphed - - else: - if out_mode == "conllu": - conllized = conllize(tokenized, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, - attrs_as_comments=True) - conllized = add_space_after(input_data, conllized) - return conllized - else: - return 
tokenized + return tokenized - if do_parse: - if filecount == 1: - # Free up GPU memory if no more files need it - del morpher - del tagger - torch.cuda.empty_cache() - - conllized = conllize(morphed, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, - attrs_as_comments=True, ten_cols=True) - parsed = diaparse(parser, conllized) - parsed = morph_deped.run_depedit(parsed) - - if do_entity: - xrenner.docname = "_" - ents = xrenner.analyze(parsed,"conll_sent") - ents = get_col(ents, -1) - entified = inject_col(ents, parsed, col=-1, into_col=9, skip_supertoks=True) - entified = add_space_after(input_data,entified) - if PY3: - return entified + if do_parse: + if filecount == 1: + # Free up GPU memory if no more files need it + del morpher + del tagger + torch.cuda.empty_cache() + + conllized = conllize(morphed, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, + attrs_as_comments=True, ten_cols=True) + parsed = diaparse(parser, conllized) + parsed = morph_deped.run_depedit(parsed) + + if do_entity: + xrenner.docname = "_" + ents = xrenner.analyze(parsed,"conll_sent") + ents = get_col(ents, -1) + entified = inject_col(ents, parsed, col=-1, into_col=9, skip_supertoks=True) + entified = add_space_after(input_data,entified) + if PY3: + return entified + else: + return entified.decode("utf8") else: - return entified.decode("utf8") + parsed = add_space_after(input_data,parsed) + return parsed else: - parsed = add_space_after(input_data,parsed) - return parsed - else: - if out_mode == "conllu": - conllized = conllize(tagged, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, - attrs_as_comments=True) - conllized = add_space_after(input_data, conllized) - return conllized - else: - return tagged + if out_mode == "conllu": + conllized = conllize(tagged, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, + attrs_as_comments=True) + conllized = add_space_after(input_data, conllized) + return conllized + else: + return tagged def run_hebpipe(): @@ -859,7 +888,8 @@ def run_hebpipe(): else: sys.stderr.write("Aborting\n") sys.exit(0) - tagger = FlairTagger() + #tagger = FlairTagger() + tagger = None morpher = FlairTagger(morph=True) lemmatizer = init_lemmatizer(cpu=opts.cpu, no_post_process=opts.disable_lex) else: @@ -875,7 +905,8 @@ def run_hebpipe(): xrenner = Xrenner(model=model_dir + "heb.xrm") else: xrenner = None - flair_sent_splitter = FlairSentSplitter() if opts.sent == "auto" and not opts.punct_sentencer else None + #flair_sent_splitter = FlairSentSplitter() if opts.sent == "auto" and not opts.punct_sentencer else None + flair_sent_splitter = None dep_parser = Parser.load(model_dir+"heb.diaparser") if opts.dependencies else None for infile in files: @@ -917,5 +948,8 @@ def run_hebpipe(): if __name__ == "__main__": import logging + from time import time logging.disable(logging.INFO) + startpipeline = time() run_hebpipe() + diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py index ea45031..70d82e1 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -5,6 +5,7 @@ import shutil import random import math +import re from flair.data import Dictionary, Sentence from transformers import BertModel,BertTokenizerFast @@ -12,25 +13,13 @@ from collections import defaultdict from lib.crfutils.crf import CRF from lib.crfutils.viterbi import ViterbiDecoder,ViterbiLoss +from .reorder_sgml import 
reorder +from .tt2conll import conllize from time import time os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256" -def spans_score(gold_spans, system_spans): - correct, gi, si = 0, 0, 0 - while gi < len(gold_spans) and si < len(system_spans): - if system_spans[si].start < gold_spans[gi].start: - si += 1 - elif gold_spans[gi].start < system_spans[si].start: - gi += 1 - else: - correct += gold_spans[gi].end == system_spans[si].end - si += 1 - gi += 1 - - return Score(len(gold_spans), len(system_spans), correct) - class Score: def __init__(self, gold_total, system_total, correct, aligned_total=None): self.correct = correct @@ -300,12 +289,12 @@ def forward(self,data,mode='train'): avgembeddings = [] for k in range(0,len(tokens.encodings)): emb = [] - maxindex = max([w for w in tokens.encodings[k].words if w]) + maxindex = max([w for w in tokens.encodings[k].words if w is not None]) try: assert maxindex == self.sequence_length - 1 # otherwise won't average correctly and align with labels except AssertionError: - print ('max index not equal sequence len. Default labels will be applied.') + #print ('max index not equal sequence len. Default labels will be applied.') badrecords.append(k) continue @@ -323,7 +312,7 @@ def forward(self,data,mode='train'): try: assert len(emb) == self.sequence_length # averaging was correct and aligns with the labels except AssertionError: - print ('embedding not built correctly. Default labels will be applied') + #print ('embedding not built correctly. Default labels will be applied') badrecords.append(k) continue @@ -339,13 +328,17 @@ def forward(self,data,mode='train'): self.batch_size -= 1 if mode != 'train': - # squeeze the embedding, as it's a single sentence - avgembeddings = torch.squeeze(avgembeddings) - # shingle the sentence embedding and its label, to calculate the dev loss later - sbdembeddings,finalsbdlabels,finalmapping = self.shingle(avgembeddings,sbdlabels) - # restore dimensionality for the POS tagging pipeline. - avgembeddings = torch.unsqueeze(avgembeddings,dim=0) - + if avgembeddings.size(dim=1) > self.stride_size: # don't shingle if seqlen less than the overlap + # squeeze the embedding, as it's a single sentence + avgembeddings = torch.squeeze(avgembeddings) + # shingle the sentence embedding and its label, to calculate the dev loss later + sbdembeddings,finalsbdlabels,finalmapping = self.shingle(avgembeddings,sbdlabels) + # restore dimensionality for the POS tagging pipeline. + avgembeddings = torch.unsqueeze(avgembeddings,dim=0) + else: + sbdembeddings = avgembeddings + finalsbdlabels = sbdlabels + finalmapping = None else: sbdembeddings = avgembeddings finalsbdlabels = sbdlabels @@ -374,17 +367,20 @@ def forward(self,data,mode='train'): else: # Predict from the shingles for SBD. 
# 'Believe the span where the token is most in the middle' - sbdpreds = [] - for idx in finalmapping: - snum, position = finalmapping[idx] - label = torch.argmax(sbdlogits[snum][position]).item() - sbdpreds.append(label) - - # Unsqueeze for input to the POS Encoder - sbdpreds = torch.LongTensor(sbdpreds) - sbdpreds = torch.unsqueeze(sbdpreds,dim=0) - sbdpreds = torch.unsqueeze(sbdpreds, dim=2) - sbdpreds = sbdpreds.to(self.device) + if sbdlogits.size(dim=1) > self.stride_size: + sbdpreds = [] + for idx in finalmapping: + snum, position = finalmapping[idx] + label = torch.argmax(sbdlogits[snum][position]).item() + sbdpreds.append(label) + + # Unsqueeze for input to the POS Encoder + sbdpreds = torch.LongTensor(sbdpreds) + sbdpreds = torch.unsqueeze(sbdpreds, dim=0) + sbdpreds = torch.unsqueeze(sbdpreds, dim=2) + sbdpreds = sbdpreds.to(self.device) + else: + sbdpreds = torch.argmax(sbdlogits, dim=2, keepdim=True) # Add the SBD predictions to the POS Encoder Input! posembeddings = torch.cat((avgembeddings,sbdpreds),dim=2) @@ -407,18 +403,10 @@ def forward(self,data,mode='train'): poslogits = self.hidden2postag(feats) poslogits = self.poscrf(poslogits) - # Some memory management - del embeddings - del sbdembeddings - del avgembeddings - del posembeddings - del feats - torch.cuda.empty_cache() - return sbdlogits, finalsbdlabels, sbdpreds, poslogits, poslabels # returns the logits and labels class Tagger(): - def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=512,sbdrnnnumlayers=2,sbdrnnbidirectional=True,sbdrnndropout=0.3,sbdencodertype='lstm',sbdffdim=512,learningrate = 0.001): + def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=512,sbdrnnnumlayers=2,sbdrnnbidirectional=True,sbdrnndropout=0.3,sbdencodertype='lstm',sbdffdim=512,learningrate = 0.001,bestmodelpath='../data/checkpoint/'): self.mtlmodel = MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdrnndropout=sbdrnndropout,sbdencodertype=sbdencodertype,sbdffdim=sbdffdim) @@ -429,15 +417,15 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdr shutil.rmtree('../data/tensorboarddir/') os.mkdir('../data/tensorboarddir/') - if not os.path.isdir('../data/checkpoint/'): - os.mkdir('../data/checkpoint/') + if not os.path.isdir(bestmodelpath): + os.mkdir(bestmodelpath) self.writer = SummaryWriter('../data/tensorboarddir/') self.trainingdatafile = '../data/sentsplit_postag_train_gold.tab' self.devdatafile = '../data/sentsplit_postag_dev_gold.tab' - else: - self.testdatafile = '../data/sentsplit_postag_test_gold.tab' + + self.bestmodel = bestmodelpath + 'best_sent_pos_model.pt' self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -463,14 +451,12 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdr self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer,milestones=[400,1000],gamma=0.1) self.evalstep = 20 - self.set_seed(42) - def set_seed(self, seed): random.seed(seed) torch.manual_seed(seed) - def train(self): + def train(self,checkpointfile=None): def read_file(mode='train'): @@ -508,14 +494,24 @@ def read_file(mode='train'): return dataset - epochs = 1500 + epochs = 1000 + bestloss = float('inf') trainingdata = read_file() devdata = read_file(mode='dev') + if checkpointfile is not None: + checkpoint = torch.load(checkpointfile) + self.mtlmodel.load_state_dict(checkpoint['model_state_dict']) + 
self.optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + self.mtlmodel.poscrf.load_state_dict(checkpoint['poscrf_state_dict']) + + self.set_seed(42) + for epoch in range(1,epochs): old_batchsize = self.mtlmodel.batch_size + self.mtlmodel.train() self.optimizer.zero_grad() @@ -539,10 +535,10 @@ def read_file(mode='train'): posloss = self.postagloss(scores,postags) - mtlloss = posloss + sbdloss # uniform weighting. # TODO: learnable weights? + mtlloss = posloss + sbdloss # TODO: learnable weights? mtlloss.backward() self.optimizer.step() - self.scheduler.step() + self.scheduler.step() # TODO: Multi-step LR annealing seems to increase sentence splitting performance. Need a best annealing strategy if old_batchsize != self.mtlmodel.batch_size: self.mtlmodel.batch_size = old_batchsize @@ -551,6 +547,9 @@ def read_file(mode='train'): self.writer.add_scalar('train_sbd_loss', sbdloss.item(), epoch) self.writer.add_scalar('train_joint_loss', mtlloss.item(), epoch) + """"""""""""""""""""""""""""""""""""""""""""" + Do dev evaluation after evalstep number of epochs + """"""""""""""""""""""""""""""""""""""""""""" if epoch % self.evalstep == 0: self.mtlmodel.eval() @@ -566,12 +565,17 @@ def read_file(mode='train'): allposgold = [] start = time() + + # because of shingling for SBD, the dev data needs to be split in slices for inference, as GPU may run out of memory with shingles on the full token list. + # shingling and SBD prediction is done on the individual slice, as well as POS tag predictions. + # TODO This naturally increases prediction time...but can't think of a better way. for slice in devdata: old_seqlen = self.mtlmodel.sequence_length if len(slice) != self.mtlmodel.sequence_length: # this will happen in one case, for the last slice in the dev batch self.mtlmodel.sequence_length = len(slice) + # Flair CRF decoding uses the Sentence object.. sentence = ' '.join([s.split('\t')[0].strip() for s in slice]) sentence = Sentence(sentence,use_tokenizer=False) @@ -580,6 +584,7 @@ def read_file(mode='train'): goldposlabels = [s.split('\t')[1].strip() for s in slice] goldposlabels = [self.mtlmodel.postagsetcrf.get_idx_for_item(s) for s in goldposlabels] + # sbdpreds already contains the sbd predictions. These were necessary for input to the POS encoder. 
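For reference, the boundary predictions enter the POS encoder as one extra feature column concatenated onto the token embeddings; a minimal self-contained sketch with made-up shapes (the real dimensions come from the model configuration):

    import torch

    emb = torch.randn(1, 128, 768)                  # (batch, seq_len, embedding_dim)
    sbd = torch.randint(0, 2, (1, 128, 1)).float()  # per-token 0/1 sentence-boundary predictions
    pos_input = torch.cat((emb, sbd), dim=2)        # (1, 128, 769): POS encoder input_size = embedding_dim + 1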
sbdlogits, sbdlabels, sbdpreds, poslogits, poslabels = self.mtlmodel(slice,mode='dev') # get the pos predictions @@ -607,8 +612,8 @@ def read_file(mode='train'): allpospreds.extend(pospreds) allposgold.extend(goldposlabels) - print ('inference time') - print (time() - start) + #print ('inference time') + #print (time() - start) if self.mtlmodel.sequence_length != old_seqlen: self.mtlmodel.sequence_length = old_seqlen @@ -624,27 +629,30 @@ def read_file(mode='train'): if allsbdpreds[i] == 1: predspans.append(UDSpan(predstartindex,i)) predstartindex = i - sbdscores = spans_score(goldspans,predspans) + + sbdscores = self.spans_score(goldspans,predspans) correctpos = sum([1 if p == g else 0 for p,g in zip(allpospreds,allposgold)]) posscores = Score(len(allposgold),len(allpospreds),correctpos,len(allpospreds)) - self.writer.add_scalar("mtl_dev_loss", round((totalsbddevloss / len(devdata) + (totalposdevloss / len(devdata))), 2), + mtlloss = (totalsbddevloss + totalposdevloss) / len(devdata) + + self.writer.add_scalar("mtl_dev_loss", round(mtlloss, 4), int(epoch / self.evalstep)) - print('mtl dev loss:' + str(round((totalsbddevloss / len(devdata) + (totalposdevloss / len(devdata))), 2))) + print('mtl dev loss:' + str(round((totalsbddevloss / len(devdata) + (totalposdevloss / len(devdata))), 4))) - self.writer.add_scalar("sbd_dev_loss",round(totalsbddevloss/len(devdata),2),int(epoch / self.evalstep)) - self.writer.add_scalar("sbd_dev_f1", round(sbdscores.f1,2), int(epoch / self.evalstep)) - self.writer.add_scalar("sbd_dev_precision", round(sbdscores.precision, 2), int(epoch / self.evalstep)) - self.writer.add_scalar("sbd_dev_recall", round(sbdscores.recall, 2), int(epoch / self.evalstep)) + self.writer.add_scalar("sbd_dev_loss",round(totalsbddevloss/len(devdata),4),int(epoch / self.evalstep)) + self.writer.add_scalar("sbd_dev_f1", round(sbdscores.f1,4), int(epoch / self.evalstep)) + self.writer.add_scalar("sbd_dev_precision", round(sbdscores.precision, 4), int(epoch / self.evalstep)) + self.writer.add_scalar("sbd_dev_recall", round(sbdscores.recall, 4), int(epoch / self.evalstep)) print ('\n') - self.writer.add_scalar("pos_dev_loss", round(totalposdevloss / len(devdata), 2), + self.writer.add_scalar("pos_dev_loss", round(totalposdevloss / len(devdata), 4), int(epoch / self.evalstep)) - self.writer.add_scalar("pos_dev_f1", round(posscores.f1, 2), int(epoch / self.evalstep)) - self.writer.add_scalar("pos_dev_precision", round(posscores.precision, 2), + self.writer.add_scalar("pos_dev_f1", round(posscores.f1, 4), int(epoch / self.evalstep)) + self.writer.add_scalar("pos_dev_precision", round(posscores.precision, 4), int(epoch / self.evalstep)) - self.writer.add_scalar("pos_dev_recall", round(posscores.recall, 2), int(epoch / self.evalstep)) + self.writer.add_scalar("pos_dev_recall", round(posscores.recall, 4), int(epoch / self.evalstep)) print ('sbd dev f1:' + str(sbdscores.f1)) print('sbd dev precision:' + str(sbdscores.precision)) @@ -656,10 +664,84 @@ def read_file(mode='train'): print('pos dev recall:' + str(posscores.recall)) print('\n') + if mtlloss < bestloss: + bestloss = mtlloss + bestmodel = self.bestmodel.replace('.pt','_' + str(round(mtlloss,6)) + '_' + str(round(sbdscores.f1,6)) + '_' + str(round(posscores.f1,6)) + '.pt') + torch.save({'epoch':epoch,'model_state_dict':self.mtlmodel.state_dict(),'optimizer_state_dict':self.optimizer.state_dict(),'poscrf_state_dict':self.mtlmodel.poscrf.state_dict()},bestmodel) + + def predict(self,toks,checkpointfile=None): + + + def is_tok(sgml_line): + 
return len(sgml_line) > 0 and not (sgml_line.startswith("<") and sgml_line.endswith(">"))
+
+        def unescape(token):
+            token = token.replace("&quot;", '"')
+            token = token.replace("&lt;", "<")
+            token = token.replace("&gt;", ">")
+            token = token.replace("&amp;", "&")
+            token = token.replace("&apos;", "'")
+            return token
+
+        slices = []
+        toks = unescape(toks) # Splitter is trained on UTF-8 forms, since LM embeddings know characters like '&'
+        lines = toks.strip().split("\n")
+        toks = [l for l in lines if is_tok(l)]
+        toks = [re.sub(r"\t.*", "", t) for t in toks]
+
+        # slice up the token list into slices of seqlen for GPU RAM reasons
+        for idx in range(0, len(toks), self.mtlmodel.sequence_length):
+            if idx + self.mtlmodel.sequence_length >= len(toks):
+                slice = toks[idx:len(toks)]
+            else:
+                slice = toks[idx: idx + self.mtlmodel.sequence_length]
+
+            slices.append(slice)
+        test = [d for slice in slices for d in slice]
-    def predict(self):
-        pass
+        assert len(test) == len(toks)
+
+
+
+        if checkpointfile is not None:
+
+            checkpoint = torch.load(checkpointfile)
+            self.mtlmodel.load_state_dict(checkpoint['model_state_dict'])
+            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+            self.mtlmodel.poscrf.load_state_dict(checkpoint['poscrf_state_dict'])
+
+        self.mtlmodel.eval()
+
+        with torch.no_grad():
+
+            allsbdpreds = []
+            allpospreds = []
+
+            for slice in slices:
+
+                if len(slice) != self.mtlmodel.sequence_length: # this will happen in one case, for the last slice in the batch
+                    self.mtlmodel.sequence_length = len(slice)
+
+                # Flair CRF decoding uses the Sentence object..
+                sentence = ' '.join([s.split('\t')[0].strip() for s in slice])
+                sentence = Sentence(sentence, use_tokenizer=False)
+
+                _, _, sbdpreds, poslogits, _ = self.mtlmodel(slice, mode='test')
+
+                # get the pos predictions
+                lengths = [self.mtlmodel.sequence_length]
+                lengths = torch.LongTensor(lengths).to(self.device)
+                scores = (poslogits, lengths, self.mtlmodel.poscrf.transitions)
+                pospreds = self.mtlmodel.viterbidecoder.decode(scores, False, [sentence])
+                pospreds = [self.mtlmodel.postagsetcrf.get_idx_for_item(p[0]) for pr in pospreds[0] for p in pr]
+
+                allsbdpreds.extend(sbdpreds)
+                allpospreds.extend(pospreds)
+
+        allpospreds = [self.mtlmodel.postagsetcrf.get_item_for_index(p) for p in allpospreds]
+
+        return allsbdpreds,allpospreds
 
     def prepare_data_files(self):
         """
@@ -689,7 +771,6 @@ def write_file(filename,mode='train'):
 
         write_file(self.trainingdatafile,mode='train')
         write_file(self.devdatafile,mode='dev')
-
     def read_conllu(self,mode='train'):
 
        fields = tuple(
@@ -704,18 +785,288 @@ def read_conllu(self,mode='train'):
         with open(file, "r", encoding="utf-8") as f:
             return conllu.parse(f.read(), fields=fields)
 
+    def split_pos(self, xml_data,out_mode='conllu',checkpointfile = None):
+
+        def is_sgml_tag(line):
+            return line.startswith("<") and line.endswith(">")
+
+        def wrap_words(xml):
+            output = []
+            lines = xml.split("\n")
+            for line in lines:
+                if len(line)>0 and not (line.startswith("<") and line.endswith(">") and not line == "|"):
+                    line = line.replace("|","\n")
+                    line = "<❦♥>\n" + line + "\n</❦♥>"
+                output.append(line)
+            return "\n".join(output)
+
+        def collapse_words(sgml):
+            output = []
+            buffer = []
+            for line in sgml.split("\n"):
+                if line in ['<❦♥>','</❦♥>'] or not is_sgml_tag(line):
+                    buffer.append(line)
+                else:
+                    output.append(line)
+                if line == "</❦♥>":
+                    piped = "|".join(buffer)
+                    if not (buffer[1] == "|" and len(buffer) == 3): # Actual pipe as token
+                        piped = piped.replace('|</❦♥>','</❦♥>').replace('<❦♥>|','<❦♥>')
+                    output.append(piped)
+                    buffer
= [] + return "\n".join(output) + + def fix_malformed_sentences(sgml_list): + """ + Fixing malformed SGML seems to boil down to two cases: + + (1) The sentence is interrupted by the close of a tag that opened before it. In this case, + update the s boundaries so that we close and begin sentences at the close tag: + + + ... + ... + ... + ... ==> + + ... + ... + + + (2) Some tag opened inside of the sentence and has remained unclosed at the time of sentence closure. + In this case, we choose not to believe the sentence split, and merge the two sentences: + + + ... + ... + ... + ==> ... + ... + ... + ... + ... + + """ + tag_opened = defaultdict(list) + i = 0 + while i < len(sgml_list): + line = sgml_list[i].strip() + open_match = re.search(OPEN_SGML_ELT, line) + close_match = re.search(CLOSE_SGML_ELT, line) + if open_match: + tag_opened[open_match.groups()[0]].append(i) + elif close_match: + tagname = close_match.groups()[0] + j = maximal_nontoken_span_end(sgml_list, i + 1) + mns = sgml_list[i:j] + + # case 1: we've encountered a non-s closing tag. If... + if ( + tagname != "s" # the closing tag is not an s + and len(tag_opened["s"]) > 0 # and we're in a sentence + and len(tag_opened[tagname]) > 0 + and len(tag_opened["s"]) > 0 # and the sentence opened after the tag + and tag_opened[tagname][-1] < tag_opened["s"][-1] + and "" not in mns # the sentence is not closed in the mns + ): + # end sentence here and move i back to the line we were looking at + sgml_list.insert(i, "") + i += 1 + # open a new sentence at the end of the mns and note that we are no longer in the sentence + sgml_list.insert(j + 1, "") + tag_opened["s"].pop(-1) + # we have successfully closed this tag + tag_opened[tagname].pop(-1) + # case 2: s closing tag and there's some tag that opened inside of it that isn't closed in time + elif tagname == "s" and any( + e != "s" and f"" not in mns + for e in [ + e + for e in tag_opened.keys() + if len(tag_opened[e]) > 0 and len(tag_opened["s"]) > 0 and tag_opened[e][-1] > + tag_opened["s"][-1] + ] + ): + # some non-s element opened within this sentence and has not been closed even in the mns + assert "" in mns + sgml_list.pop(i) + i -= 1 + sgml_list.pop(i + mns.index("")) + else: + tag_opened[tagname].pop(-1) + i += 1 + return sgml_list + + def maximal_nontoken_span_end(sgml_list, i): + """Return j such that sgml_list[i:j] does not contain tokens + and no element that is begun in the MNS is closed in it.""" + opened = [] + j = i + while j < len(sgml_list): + line = sgml_list[j] + open_match = re.match(OPEN_SGML_ELT, line) + close_match = re.match(CLOSE_SGML_ELT, line) + if not (open_match or close_match): + break + if open_match: + opened.append(open_match.groups()[0]) + if close_match and close_match.groups()[0] in opened: + break + j += 1 + return j + + def get_bound_group_map(data): + + mapping = {} + data = data.split("\n") + # Ignore markup + data = [u for u in data if not (u.startswith("<") and u.endswith(">"))] + counter = 0 + for i, line in enumerate(data): + super_token = line.replace("|", "") if line != "|" else "|" + segs = line.split("|") if line != "|" else ["|"] + for j, seg in enumerate(segs): + if len(segs) > 1 and j == 0: + mapping[counter] = (super_token, len(segs)) + super_token = "" + counter += 1 + + return mapping + + # These XML tags force a sentence break in the data, you can add more here: + BLOCK_TAGS = ["sp", "head", "p", "figure", "caption", "list", "item"] + BLOCK_TAGS += ["❦❦❦"] # reserved tag for sentences in input based on newlines + OPEN_SGML_ELT = 
re.compile(r"^<([^/ ]+)( .*)?>$") + CLOSE_SGML_ELT = re.compile(r"^$") + + # Sometimes the tokenizer doesn't newline every elt + xml_data = xml_data.replace("><", ">\n<") + # Ad hoc fix for a tokenization error + xml_data = xml_data.replace("°<", "°\n<") + # Remove empty elements? + # for elt in TAGS: + # xml_data = xml_data.replace(f"<{elt}>\n\n", "") + xml_data = wrap_words(xml_data) + + # don't feed the sentencer our pos and lemma predictions, if we have them + no_pos_lemma = re.sub(r"([^\n\t]*?)\t[^\n\t]*?\t[^\n\t]*?\n", r"\1\n", xml_data) + split_indices, pos_tags = self.predict(no_pos_lemma,checkpointfile=checkpointfile) + + # for xml + counter = 0 + splitted = [] + opened_sent = False + para = True + + xml_data = xml_data.replace("","<❦❦❦>").replace("","") + for line in xml_data.strip().split("\n"): + if not is_sgml_tag(line): + # Token + if split_indices[counter] == 1 or para: + if opened_sent: + rev_counter = len(splitted) - 1 + while is_sgml_tag(splitted[rev_counter]) and rev_counter > 0: + rev_counter -= 1 + if rev_counter > 0: + splitted.insert(rev_counter + 1, "") + splitted.append("") + opened_sent = True + para = False + counter += 1 + elif any(f"<{elt}>" in line for elt in BLOCK_TAGS) or any( + f"" in line for elt in BLOCK_TAGS + ): # New block, force sentence split + para = True + splitted.append(line) + + if opened_sent: + rev_counter = len(splitted) - 1 + while is_sgml_tag(splitted[rev_counter]): + rev_counter -= 1 + splitted.insert(rev_counter + 1, "") + + lines = "\n".join(splitted) + lines = re.sub(r'\n?','',lines) + lines = reorder(lines, priorities=["s","❦♥"]) + lines = collapse_words(lines) + + # destroy any xml inside supertokens + while re.search(r'(<❦♥>[^<>]*)<[^❦♥]+>',lines) is not None: + lines = re.sub(r'(<❦♥>[^<>]*)<[^❦♥]+>([^<>]*)',r'\1\2',lines) + + # remove word and sent wrappers + lines = re.sub(r'','',lines) + + lines = reorder(lines) + lines = fix_malformed_sentences(lines.split("\n")) + lines = "\n".join(lines) + lines = reorder(lines) + + # Split out the internal tags within MWT tokens, as these too get a POS tag + lines = lines.split("\n") + retokenized = [] + for line in lines: + if line == "|": + retokenized.append(line) + else: + retokenized.append("\n".join(line.split("|"))) + lines = "\n".join(retokenized) + + """ + Now add the pos tags + """ + bound_group_map = get_bound_group_map(lines) if out_mode == "conllu" else None + data = conllize(lines, element="s", super_mapping=bound_group_map, attrs_as_comments=True) + data = data.strip() + "\n" # Ensure final new line for last sentence + + # add the pos tags to conllized file and remove the rows hyphenated MWT ID + output = [] + tid = 1 + k = 0 + data = data.split('\n') + for i in range(0,len(data)): + if len(data[i].strip())==0: + output.append("") + tid = 1 + else: + if "\t" in data[i]: + fields = data[i].split("\t") + if "." 
in fields[0] or "-" in fields[0]: continue + else: + fields = [str(tid), fields[1].strip(), "_", pos_tags[k].strip(), pos_tags[k].strip(), "_", "_", "_", "_", "_"] + output.append('\t'.join(fields)) + tid += 1 + k += 1 + + assert k == len(pos_tags) # Fails means pos tags aren't aligned with tokens + + return "\n".join(output), lines + + def spans_score(self, gold_spans, system_spans): + correct, gi, si = 0, 0, 0 + while gi < len(gold_spans) and si < len(system_spans): + if system_spans[si].start < gold_spans[gi].start: + si += 1 + elif gold_spans[gi].start < system_spans[si].start: + gi += 1 + else: + correct += gold_spans[gi].end == system_spans[si].end + si += 1 + gi += 1 + + return Score(len(gold_spans), len(system_spans), correct) + + def main(): # testing only iahltwikitrain = '/home/nitin/Desktop/IAHLT/UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-train.conllu' iahltwikidev = '/home/nitin/Desktop/IAHLT/UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-dev.conllu' - tagger = Tagger(trainflag=True,trainfile=iahltwikitrain,devfile=iahltwikidev) tagger.prepare_data_files() tagger.train() - print ('here') if __name__ == "__main__": From fb0648ba20edca9684278573345942f573b967f8 Mon Sep 17 00:00:00 2001 From: nitin Date: Fri, 29 Jul 2022 13:09:00 +0800 Subject: [PATCH 14/32] align with flair architecture --- hebpipe/lib/dropout.py | 54 +++++++++ .../lib/multitask_sentsplitter_postagger.py | 109 +++++++----------- 2 files changed, 96 insertions(+), 67 deletions(-) create mode 100644 hebpipe/lib/dropout.py diff --git a/hebpipe/lib/dropout.py b/hebpipe/lib/dropout.py new file mode 100644 index 0000000..eea9e16 --- /dev/null +++ b/hebpipe/lib/dropout.py @@ -0,0 +1,54 @@ +import torch + + +class LockedDropout(torch.nn.Module): + """ + Implementation of locked (or variational) dropout. Randomly drops out entire parameters in embedding space. + """ + + def __init__(self, dropout_rate=0.5, batch_first=True, inplace=False): + super(LockedDropout, self).__init__() + self.dropout_rate = dropout_rate + self.batch_first = batch_first + self.inplace = inplace + + def forward(self, x): + if not self.training or not self.dropout_rate: + return x + + if not self.batch_first: + m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - self.dropout_rate) + else: + m = x.data.new(x.size(0), 1, x.size(2)).bernoulli_(1 - self.dropout_rate) + + mask = torch.autograd.Variable(m, requires_grad=False) / (1 - self.dropout_rate) + mask = mask.expand_as(x) + return mask * x + + def extra_repr(self): + inplace_str = ", inplace" if self.inplace else "" + return "p={}{}".format(self.dropout_rate, inplace_str) + + +class WordDropout(torch.nn.Module): + """ + Implementation of word dropout. Randomly drops out entire words (or characters) in embedding space. 
+ """ + + def __init__(self, dropout_rate=0.05, inplace=False): + super(WordDropout, self).__init__() + self.dropout_rate = dropout_rate + self.inplace = inplace + + def forward(self, x): + if not self.training or not self.dropout_rate: + return x + + m = x.data.new(x.size(0), x.size(1), 1).bernoulli_(1 - self.dropout_rate) + + mask = torch.autograd.Variable(m, requires_grad=False) + return mask * x + + def extra_repr(self): + inplace_str = ", inplace" if self.inplace else "" + return "p={}{}".format(self.dropout_rate, inplace_str) \ No newline at end of file diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py index 70d82e1..2fcb439 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -8,6 +8,7 @@ import re from flair.data import Dictionary, Sentence +from lib.dropout import WordDropout,LockedDropout from transformers import BertModel,BertTokenizerFast from random import sample from collections import defaultdict @@ -39,28 +40,9 @@ def __init__(self, start, end): # so we can use characters[start:end] or range(start, end). self.end = end -class PositionalEncoding(nn.Module): - - def __init__(self, d_model, dropout=0.1, max_len=5000): - super(PositionalEncoding, self).__init__() - self.dropout = nn.Dropout(p=dropout) - - pe = torch.zeros(max_len, d_model) - position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) - div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) - pe[:, 0::2] = torch.sin(position * div_term) - pe[:, 1::2] = torch.cos(position * div_term) - pe = pe.unsqueeze(0).transpose(0, 1) - self.register_buffer('pe', pe) - - def forward(self, x): - x = x + self.pe[:x.size(0), :] - return self.dropout(x) - - class MTLModel(nn.Module): - def __init__(self,sbdrnndim=512,posrnndim=512,sbdrnnnumlayers=2,posrnnnumlayers=2,sbdrnnbidirectional=True,posrnnbidirectional=True,sbdrnndropout=0.3,posrnndropout=0.3,sbdencodertype='lstm',posencodertype='lstm',sbdffdim=512,posffdim=512,batchsize=16,sbdtransformernumlayers=4,sbdnhead=4,sequencelength=128): + def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers=1,sbdrnnbidirectional=True,posrnnbidirectional=True,sbdencodertype='lstm',posencodertype='lstm',batchsize=16,sequencelength=128,dropout=0.0,wordropout=0.05,lockeddropout=0.5): super(MTLModel,self).__init__() self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -90,28 +72,20 @@ def __init__(self,sbdrnndim=512,posrnndim=512,sbdrnnnumlayers=2,posrnnnumlayers= self.sbdrnndim = sbdrnndim self.sbdrnnnumlayers = sbdrnnnumlayers self.sbdrnnbidirectional = sbdrnnbidirectional - self.sbdrnndropout = sbdrnndropout #Bi-LSTM Encoder for POS tagging self.posrnndim = posrnndim self.posrnnnumlayers = posrnnnumlayers self.posrnnbidirectional = posrnnbidirectional - self.posrnndropout = posrnndropout if sbdencodertype == 'lstm': self.sbdencoder = nn.LSTM(input_size=self.embeddingdim, hidden_size=self.sbdrnndim // 2, num_layers=self.sbdrnnnumlayers, bidirectional=self.sbdrnnbidirectional, - dropout=self.sbdrnndropout,batch_first=True).to(self.device) + batch_first=True).to(self.device) elif sbdencodertype == 'gru': self.sbdencoder = nn.GRU(input_size=self.embeddingdim, hidden_size=self.sbdrnndim // 2, num_layers=self.sbdrnnnumlayers, bidirectional=self.sbdrnnbidirectional, - dropout=self.sbdrnndropout,batch_first=True).to(self.device) - elif sbdencodertype == 'transformer': - 
self.sbdtransformernumlayers = sbdtransformernumlayers - self.sbdnhead = sbdnhead - self.sbdencoderlayer = nn.TransformerEncoderLayer(d_model= self.embeddingdim,nhead=self.sbdnhead).to(self.device) - self.sbdencoder = nn.TransformerEncoder(self.sbdencoderlayer,num_layers=self.sbdtransformernumlayers).to(self.device) - self.sbdposencoder = PositionalEncoding(d_model=self.embeddingdim).to(self.device) + batch_first=True).to(self.device) # param init for name, param in self.sbdencoder.named_parameters(): @@ -126,11 +100,11 @@ def __init__(self,sbdrnndim=512,posrnndim=512,sbdrnnnumlayers=2,posrnnnumlayers= if posencodertype == 'lstm': self.posencoder = nn.LSTM(input_size=self.embeddingdim + 1, hidden_size=self.posrnndim // 2, num_layers=self.posrnnnumlayers, bidirectional=self.posrnnbidirectional, - dropout=self.posrnndropout,batch_first=True).to(self.device) + batch_first=True).to(self.device) elif posencodertype == 'gru': self.posencoder = nn.GRU(input_size=self.embeddingdim + 1, hidden_size=self.posrnndim // 2, num_layers=self.posrnnnumlayers, bidirectional=self.posrnnbidirectional, - dropout=self.posrnndropout,batch_first=True).to(self.device) + batch_first=True).to(self.device) # param init for name, param in self.posencoder.named_parameters(): @@ -142,35 +116,30 @@ def __init__(self,sbdrnndim=512,posrnndim=512,sbdrnnnumlayers=2,posrnnnumlayers= except ValueError as ex: nn.init.constant_(param, 0.0) - self.relu = nn.ReLU() + #self.relu = nn.ReLU() - # Intermediate feedforward layer - self.sbdffdim = sbdffdim - if sbdencodertype == 'transformer': - self.sbdfflayer = nn.Linear(in_features=self.embeddingdim, out_features=self.sbdffdim).to(self.device) - else: - self.sbdfflayer = nn.Linear(in_features=self.sbdrnndim, out_features=self.sbdffdim).to(self.device) + # Reproject embeddings layer + self.sbdembedding2nn = nn.Linear(in_features=self.embeddingdim,out_features=self.embeddingdim).to(self.device) # param init - for name, param in self.sbdfflayer.named_parameters(): + for name, param in self.sbdembedding2nn.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_normal_(param) # Intermediate feedforward layer - self.posffdim = posffdim - self.posfflayer = nn.Linear(in_features=self.posrnndim, out_features=self.posffdim).to(self.device) + self.posembedding2nn = nn.Linear(in_features=self.embeddingdim + 1,out_features=self.embeddingdim + 1).to(self.device) # param init - for name, param in self.posfflayer.named_parameters(): + for name, param in self.posembedding2nn.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_normal_(param) # Label space for the pos tagger - self.hidden2postag = nn.Linear(in_features=self.posffdim,out_features=len(self.postagsetcrf)).to(self.device) + self.hidden2postag = nn.Linear(in_features=self.posrnndim,out_features=len(self.postagsetcrf)).to(self.device) for name, param in self.hidden2postag.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) @@ -178,7 +147,7 @@ def __init__(self,sbdrnndim=512,posrnndim=512,sbdrnnnumlayers=2,posrnnnumlayers= nn.init.xavier_normal_(param) # Label space for sent splitter - self.hidden2sbd = nn.Linear(in_features=self.sbdffdim,out_features=len(self.sbd_tag2idx.keys())).to(self.device) + self.hidden2sbd = nn.Linear(in_features=self.sbdrnndim,out_features=len(self.sbd_tag2idx.keys())).to(self.device) # param init for name, param in self.hidden2sbd.named_parameters(): @@ -187,9 +156,10 @@ def 
__init__(self,sbdrnndim=512,posrnndim=512,sbdrnnnumlayers=2,posrnnnumlayers= elif 'weight' in name: nn.init.xavier_normal_(param) - self.sigmoid = nn.Sigmoid() - self.dropout = nn.Dropout(p=0.5) - self.embeddingdropout = nn.Dropout(p=0.1) + #self.sigmoid = nn.Sigmoid() + self.dropout = nn.Dropout(dropout) + self.worddropout = WordDropout(wordropout) + self.lockeddropout = LockedDropout(lockeddropout) self.poscrf = CRF(self.postagsetcrf,len(self.postagsetcrf),init_from_state_dict=False) # TODO: parameterize self.viterbidecoder = ViterbiDecoder(self.postagsetcrf) @@ -344,19 +314,16 @@ def forward(self,data,mode='train'): finalsbdlabels = sbdlabels finalmapping = None - sbdembeddings = self.embeddingdropout(sbdembeddings) + sbdembeddings = self.dropout(sbdembeddings) + sbdembeddings = self.worddropout(sbdembeddings) + sbdembeddings = self.lockeddropout(sbdembeddings) - # SBD encoder and labels - if self.sbdencodertype in ('lstm','gru'): - feats, _ = self.sbdencoder(sbdembeddings) - else: - feats = self.sbdposencoder(sbdembeddings) - feats = self.sbdencoder(feats) + sbdembeddings = self.sbdembedding2nn(sbdembeddings) - # SBD Intermediate Feedforward layer - feats = self.sbdfflayer(feats) - feats = self.relu(feats) + # SBD encoder and labels + feats, _ = self.sbdencoder(sbdembeddings) feats = self.dropout(feats) + feats = self.lockeddropout(feats) # SBD logits sbdlogits = self.hidden2sbd(feats) @@ -393,22 +360,25 @@ def forward(self,data,mode='train'): else: sbdpreds = None - if self.posencodertype in ('lstm','gru'): - feats,_ = self.posencoder(posembeddings) + posembeddings = self.dropout(posembeddings) + posembeddings = self.worddropout(posembeddings) + posembeddings = self.lockeddropout(posembeddings) + posembeddings = self.posembedding2nn(posembeddings) - # logits for pos - feats = self.posfflayer(feats) - feats = self.relu(feats) + feats,_ = self.posencoder(posembeddings) feats = self.dropout(feats) + feats = self.lockeddropout(feats) + + # logits for pos poslogits = self.hidden2postag(feats) poslogits = self.poscrf(poslogits) return sbdlogits, finalsbdlabels, sbdpreds, poslogits, poslabels # returns the logits and labels class Tagger(): - def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=512,sbdrnnnumlayers=2,sbdrnnbidirectional=True,sbdrnndropout=0.3,sbdencodertype='lstm',sbdffdim=512,learningrate = 0.001,bestmodelpath='../data/checkpoint/'): + def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=128,sbdrnnnumlayers=1,sbdrnnbidirectional=True,dropout=0.0,wordropout=0.05,lockeddropout=0.5,sbdencodertype='lstm',learningrate = 0.001,bestmodelpath='../data/checkpoint/'): - self.mtlmodel = MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdrnndropout=sbdrnndropout,sbdencodertype=sbdencodertype,sbdffdim=sbdffdim) + self.mtlmodel = MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdencodertype=sbdencodertype,dropout=dropout,wordropout=wordropout,lockeddropout=lockeddropout) if trainflag == True: @@ -444,8 +414,8 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdr self.sbdloss = nn.CrossEntropyLoss(weight=torch.FloatTensor([1,3])) self.sbdloss.to(self.device) - self.optimizer = torch.optim.AdamW(list(self.mtlmodel.sbdencoder.parameters()) + list(self.mtlmodel.sbdfflayer.parameters()) + - list(self.mtlmodel.hidden2sbd.parameters()) + list(self.mtlmodel.posencoder.parameters()) + 
list(self.mtlmodel.posfflayer.parameters()) + self.optimizer = torch.optim.AdamW(list(self.mtlmodel.sbdencoder.parameters()) + list(self.mtlmodel.sbdembedding2nn.parameters()) + + list(self.mtlmodel.hidden2sbd.parameters()) + list(self.mtlmodel.posencoder.parameters()) + list(self.mtlmodel.posembedding2nn.parameters()) + list(self.mtlmodel.hidden2postag.parameters()) + list(self.mtlmodel.poscrf.parameters()), lr=learningrate) self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer,milestones=[400,1000],gamma=0.1) @@ -1063,7 +1033,12 @@ def main(): # testing only iahltwikitrain = '/home/nitin/Desktop/IAHLT/UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-train.conllu' iahltwikidev = '/home/nitin/Desktop/IAHLT/UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-dev.conllu' + htbdev = '/home/nitin/Desktop/htb/UD_Hebrew/he_htb-ud-dev.conllu' + htbtrain = '/home/nitin/Desktop/htb/UD_Hebrew/he_htb-ud-train.conllu' + tagger = Tagger(trainflag=True,trainfile=iahltwikitrain,devfile=iahltwikidev) + #tagger = Tagger(trainflag=True,trainfile=htbtrain,devfile=htbdev) + tagger.prepare_data_files() tagger.train() From 4450c9c54356f48b5fa1d56b02f5750850c7b028 Mon Sep 17 00:00:00 2001 From: nitin Date: Thu, 4 Aug 2022 02:28:13 +0800 Subject: [PATCH 15/32] final results; integration pending --- hebpipe/heb_pipe.py | 18 +- .../lib/multitask_sentsplitter_postagger.py | 242 ++++++++++++++---- 2 files changed, 200 insertions(+), 60 deletions(-) diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index afee763..c05dfc1 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -13,6 +13,9 @@ import torch from time import time +import cProfile +import pstats + from rftokenizer import RFTokenizer try: # Module usage @@ -598,18 +601,16 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, if from_pipes: input_data = input_data.replace("|","") - #start = time.time() - mtltagger = Tagger(trainflag=False) - #print ('mtl init') - #print (time.time() - start) + # Wiki + mtltagger = Tagger(trainflag=False,bestmodelpath='data/checkpoint/',sequencelength=256,sbdrnndim=512,posrnndim=512,sbdfflayerdim=512) + # HTB + #mtltagger = Tagger(trainflag=False,bestmodelpath='data/checkpoint/',sequencelength=320,sbdrnndim=256,posrnndim=512,sbdfflayerdim=256) if preloaded is not None: + rf_tok, xrenner, flair_sent_splitter, parser, tagger, morpher, lemmatizer = preloaded else: - - - rf_tok = RFTokenizer(model=model_dir + "heb.sm" + str(sys.version_info[0])) xrenner = Xrenner(model=model_dir + "heb.xrm") @@ -676,8 +677,7 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, """ sent_tag = 's' - start = time() - tagged_conllu, tokenized = mtltagger.split_pos(tokenized,checkpointfile='/home/nitin/Desktop/hebpipe/HebPipe/hebpipe/data/checkpoint/best_sent_pos_model_12.603806_0.866864_0.971045.pt') + tagged_conllu, tokenized = mtltagger.split_pos(tokenized,checkpointfile='/home/nitin/Desktop/hebpipe/HebPipe/hebpipe/data/checkpoint/top_wiki_best_sent_pos_model_17.477738_0.857963_0.972323.pt') pos_tags = [l.split("\t")[3] for l in tagged_conllu.split("\n") if "\t" in l] del mtltagger diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py index 2fcb439..e11436d 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -4,22 +4,23 @@ import os import shutil import random -import math import re +import argparse + from flair.data import Dictionary, Sentence from lib.dropout import 
WordDropout,LockedDropout -from transformers import BertModel,BertTokenizerFast +from transformers import BertModel,BertTokenizerFast,BertConfig from random import sample from collections import defaultdict from lib.crfutils.crf import CRF from lib.crfutils.viterbi import ViterbiDecoder,ViterbiLoss -from .reorder_sgml import reorder -from .tt2conll import conllize +from lib.reorder_sgml import reorder +from lib.tt2conll import conllize from time import time -os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256" +#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256" class Score: def __init__(self, gold_total, system_total, correct, aligned_total=None): @@ -42,7 +43,7 @@ def __init__(self, start, end): class MTLModel(nn.Module): - def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers=1,sbdrnnbidirectional=True,posrnnbidirectional=True,sbdencodertype='lstm',posencodertype='lstm',batchsize=16,sequencelength=128,dropout=0.0,wordropout=0.05,lockeddropout=0.5): + def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers=1,posfflayerdim=512,sbdrnnbidirectional=True,posrnnbidirectional=True,sbdencodertype='lstm',sbdfflayerdim=256,posencodertype='lstm',batchsize=16,sequencelength=256,dropout=0.0,wordropout=0.05,lockeddropout=0.5): super(MTLModel,self).__init__() self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -50,6 +51,7 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= # tagsets - amend labels here self.postagset = {'ADJ':0, 'ADP':1, 'ADV':2, 'AUX':3, 'CCONJ':4, 'DET':5, 'INTJ':6, 'NOUN':7, 'NUM':8, 'PRON':9, 'PROPN':10, 'PUNCT':11, 'SCONJ':12, 'SYM':13, 'VERB':14, 'X':15} # derived from HTB and IAHLTWiki trainsets #TODO: add other UD tags? self.sbd_tag2idx = {'B-SENT': 1,'O': 0} + self.supertokenset = {'O':0,'B':1,'I':2,'E':3} # POS tagset in Dictionary object for Flair CRF self.postagsetcrf = Dictionary() @@ -63,10 +65,14 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= self.batch_size = batchsize # Embedding parameters and model - # Embeddings on the cpu. 
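The change below replaces the final-layer BERT output with a plain average over the last few hidden layers. A standalone sketch of the same idea against the Hugging Face API (the example input is an assumption for illustration):

    import torch
    from transformers import BertConfig, BertModel, BertTokenizerFast

    cfg = BertConfig.from_pretrained('onlplab/alephbert-base', output_hidden_states=True)
    tokenizer = BertTokenizerFast.from_pretrained('onlplab/alephbert-base')
    model = BertModel.from_pretrained('onlplab/alephbert-base', config=cfg)

    enc = tokenizer(['שלום עולם'], return_tensors='pt')
    out = model(**enc)
    last4 = out.hidden_states[-4:]               # tuple of four tensors, each (batch, subword_len, 768)
    avg = torch.stack(last4, dim=0).mean(dim=0)  # equivalent to summing the four layers and dividing by 4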
+ config = BertConfig.from_pretrained('onlplab/alephbert-base',output_hidden_states=True) self.tokenizer = BertTokenizerFast.from_pretrained('onlplab/alephbert-base') - self.model = BertModel.from_pretrained('onlplab/alephbert-base').to(self.device) + self.model = BertModel.from_pretrained('onlplab/alephbert-base',config=config).to(self.device) self.embeddingdim = 768 + self.lastn = 4 + + for param in self.model.base_model.parameters(): + param.requires_grad = False # Bi-LSTM Encoder for SBD self.sbdrnndim = sbdrnndim @@ -98,11 +104,11 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= nn.init.constant_(param,0.0) if posencodertype == 'lstm': - self.posencoder = nn.LSTM(input_size=self.embeddingdim + 1, hidden_size=self.posrnndim // 2, + self.posencoder = nn.LSTM(input_size=self.embeddingdim + 5, hidden_size=self.posrnndim // 2, num_layers=self.posrnnnumlayers, bidirectional=self.posrnnbidirectional, batch_first=True).to(self.device) elif posencodertype == 'gru': - self.posencoder = nn.GRU(input_size=self.embeddingdim + 1, hidden_size=self.posrnndim // 2, + self.posencoder = nn.GRU(input_size=self.embeddingdim + 5, hidden_size=self.posrnndim // 2, num_layers=self.posrnnnumlayers, bidirectional=self.posrnnbidirectional, batch_first=True).to(self.device) @@ -116,10 +122,12 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= except ValueError as ex: nn.init.constant_(param, 0.0) - #self.relu = nn.ReLU() + self.relu = nn.ReLU() # Reproject embeddings layer - self.sbdembedding2nn = nn.Linear(in_features=self.embeddingdim,out_features=self.embeddingdim).to(self.device) + self.sbdembedding2nn = nn.Linear(in_features=self.embeddingdim ,out_features=self.embeddingdim).to(self.device) + self.sbdfflayerdim = sbdfflayerdim + self.sbdfflayer = nn.Linear(in_features=self.sbdrnndim, out_features=self.sbdfflayerdim).to(self.device) # param init for name, param in self.sbdembedding2nn.named_parameters(): @@ -129,7 +137,9 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= nn.init.xavier_normal_(param) # Intermediate feedforward layer - self.posembedding2nn = nn.Linear(in_features=self.embeddingdim + 1,out_features=self.embeddingdim + 1).to(self.device) + self.posembedding2nn = nn.Linear(in_features=self.embeddingdim + 5,out_features=self.embeddingdim + 5).to(self.device) + self.posfflayerdim = posfflayerdim + self.posfflayer = nn.Linear(in_features=self.posrnndim, out_features=self.posfflayerdim).to(self.device) # param init for name, param in self.posembedding2nn.named_parameters(): @@ -139,7 +149,7 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= nn.init.xavier_normal_(param) # Label space for the pos tagger - self.hidden2postag = nn.Linear(in_features=self.posrnndim,out_features=len(self.postagsetcrf)).to(self.device) + self.hidden2postag = nn.Linear(in_features=self.posfflayerdim,out_features=len(self.postagsetcrf)).to(self.device) for name, param in self.hidden2postag.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) @@ -147,7 +157,7 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= nn.init.xavier_normal_(param) # Label space for sent splitter - self.hidden2sbd = nn.Linear(in_features=self.sbdrnndim,out_features=len(self.sbd_tag2idx.keys())).to(self.device) + self.hidden2sbd = nn.Linear(in_features=self.sbdfflayerdim,out_features=len(self.sbd_tag2idx.keys())).to(self.device) # param init for name, param in 
self.hidden2sbd.named_parameters(): @@ -235,24 +245,60 @@ def forward(self,data,mode='train'): sentences = [' '.join([s.split('\t')[0].strip() for s in sls]) for sls in data] sbdlabels = [[self.sbd_tag2idx[s.split('\t')[2].strip()] for s in sls] for sls in data] poslabels = [[self.postagsetcrf.get_idx_for_item(s.split('\t')[1].strip()) for s in sls] for sls in data] + + supertokenlabels = [] + for sls in data: + record = [] + for s in sls: + temp = [0] * len(self.supertokenset) + temp[self.supertokenset[s.split('\t')[-1].strip()]] = 1 + record.append(temp) + supertokenlabels.append(record) + elif mode == 'dev': # inference is on a single record sentences = [' '.join([s.split('\t')[0].strip() for s in data])] sbdlabels = [self.sbd_tag2idx[s.split('\t')[2].strip()] for s in data] poslabels = [self.postagsetcrf.get_idx_for_item(s.split('\t')[1].strip()) for s in data] - else: # test - has no labels, and 2D tensor single record - sentences = [' '.join([s.split('\t')[0].strip() for s in data])] + + supertokenlabels = [] + for s in data: + temp = [0] * len(self.supertokenset) + temp[self.supertokenset[s.split('\t')[-1].strip()]] = 1 + supertokenlabels.append(temp) + + else: # test - a tuple of text and supertoken labels + sentences = [' '.join([s.split('\t')[0].strip() for s in data[0]])] + + supertokenlabels = [] + for s in data[1]: + temp = [0] * len(self.supertokenset) + temp[self.supertokenset[s.strip()]] = 1 + supertokenlabels.append(temp) + sbdlabels = None poslabels = None - # Make embeddings + # Make embeddings and scalar average them across subwords, vertically. sentences = [d.split() for d in sentences] # for AlephBERT - tokens = self.tokenizer(sentences,return_tensors='pt',padding=True,is_split_into_words=True).to(self.device) # tell AlephBERT that there is some tokenization already. - embeddings = self.model(**tokens) - embeddings = embeddings[0] + tokens = self.tokenizer(sentences,return_tensors='pt',padding=True,is_split_into_words=True,truncation=True).to(self.device) # tell AlephBERT that there is some tokenization already. + try: + output = self.model(**tokens) + except Exception: + print ('here') + raise + hiddenstates = output[2][-self.lastn:] + scalarsum = hiddenstates[0] + for i in range(1,self.lastn): + scalarsum = torch.add(scalarsum,hiddenstates[i],alpha=1) + + embeddings = torch.div(scalarsum,self.lastn) + #embeddings = embeddings.to(self.device) + + #embeddings = embeddings[0] #embeddings = embeddings.to(self.device) """ - Average the subword embeddings + Average the subword embeddings within the horizontal sequence. This process will drop the [CLS],[SEP] and [PAD] tokens """ @@ -272,7 +318,7 @@ def forward(self,data,mode='train'): indices = [j for j,x in enumerate(tokens.encodings[k].words) if x == i] if len(indices) == 0: # This strange case needs to be handled. 
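The surrounding loop builds one vector per whitespace token by averaging the subword vectors the tokenizer assigned to it, with a zero vector as the fallback for the "strange case" above where no subword maps to the word. A compact, self-contained sketch of that alignment (function name and toy tensors are illustrative):

    import torch

    def average_subwords(subword_emb, word_ids, num_words):
        # subword_emb: (num_subwords, dim); word_ids[i] is the word index of subword i, or None for specials
        out = []
        for w in range(num_words):
            idx = [i for i, wid in enumerate(word_ids) if wid == w]
            if not idx:                                   # no subword maps to this word
                out.append(torch.zeros(subword_emb.size(1)))
            else:
                out.append(subword_emb[idx].mean(dim=0))  # one subword, or the average of several
        return torch.stack(out)

    vecs = average_subwords(torch.randn(5, 8), [None, 0, 1, 1, None], num_words=2)
    assert vecs.shape == (2, 8)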
- emb.append(torch.zeros(self.embeddingdim,device=self.device)) + emb.append(torch.zeros(self.embeddingdim ,device=self.device)) elif len(indices) == 1: # no need to average emb.append(embeddings[k][indices[0]]) else: # needs to aggregate - average @@ -322,6 +368,8 @@ def forward(self,data,mode='train'): # SBD encoder and labels feats, _ = self.sbdencoder(sbdembeddings) + feats = self.sbdfflayer(feats) + feats = self.relu(feats) feats = self.dropout(feats) feats = self.lockeddropout(feats) @@ -349,8 +397,13 @@ def forward(self,data,mode='train'): else: sbdpreds = torch.argmax(sbdlogits, dim=2, keepdim=True) + supertokenlabels = torch.LongTensor(supertokenlabels) + supertokenlabels = supertokenlabels.to(self.device) + if mode in ('dev','test'): + supertokenlabels = torch.unsqueeze(supertokenlabels,dim=0) + # Add the SBD predictions to the POS Encoder Input! - posembeddings = torch.cat((avgembeddings,sbdpreds),dim=2) + posembeddings = torch.cat((avgembeddings,sbdpreds,supertokenlabels),dim=2) if mode in ('dev','test'): # Squeeze these to return to the Trainer for scores, now that we are done with them @@ -366,6 +419,8 @@ def forward(self,data,mode='train'): posembeddings = self.posembedding2nn(posembeddings) feats,_ = self.posencoder(posembeddings) + feats = self.posfflayer(feats) + feats = self.relu(feats) feats = self.dropout(feats) feats = self.lockeddropout(feats) @@ -376,9 +431,9 @@ def forward(self,data,mode='train'): return sbdlogits, finalsbdlabels, sbdpreds, poslogits, poslabels # returns the logits and labels class Tagger(): - def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=128,sbdrnnnumlayers=1,sbdrnnbidirectional=True,dropout=0.0,wordropout=0.05,lockeddropout=0.5,sbdencodertype='lstm',learningrate = 0.001,bestmodelpath='../data/checkpoint/'): + def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=128,sbdrnnnumlayers=1,sbdrnnbidirectional=True,sbdfflayerdim=256,posrnndim=256,posrnnnumlayers=1,posrnnbidirectional=True,posfflayerdim=512,dropout=0.05,wordropout=0.05,lockeddropout=0.5,sbdencodertype='lstm',posencodertype='lstm',learningrate = 0.001,bestmodelpath='../data/checkpoint/',batchsize=32,sequencelength=256,datatype='htb'): - self.mtlmodel = MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdencodertype=sbdencodertype,dropout=dropout,wordropout=wordropout,lockeddropout=lockeddropout) + self.mtlmodel = MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdencodertype=sbdencodertype,sbdfflayerdim=sbdfflayerdim,dropout=dropout,wordropout=wordropout,lockeddropout=lockeddropout,posrnndim=posrnndim,posrnnbidirectional=posrnnbidirectional,posencodertype=posencodertype,posrnnnumlayers=posrnnnumlayers,posfflayerdim=posfflayerdim,batchsize=batchsize,sequencelength=sequencelength) if trainflag == True: @@ -395,7 +450,7 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdr self.trainingdatafile = '../data/sentsplit_postag_train_gold.tab' self.devdatafile = '../data/sentsplit_postag_dev_gold.tab' - self.bestmodel = bestmodelpath + 'best_sent_pos_model.pt' + self.bestmodel = bestmodelpath + datatype + '_best_sent_pos_model.pt' self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -418,8 +473,8 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdr list(self.mtlmodel.hidden2sbd.parameters()) + list(self.mtlmodel.posencoder.parameters()) 
+ list(self.mtlmodel.posembedding2nn.parameters()) + list(self.mtlmodel.hidden2postag.parameters()) + list(self.mtlmodel.poscrf.parameters()), lr=learningrate) - self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer,milestones=[400,1000],gamma=0.1) - self.evalstep = 20 + self.scheduler = torch.optim.lr_scheduler.CyclicLR(self.optimizer,base_lr=learningrate/10,max_lr=learningrate,step_size_up=250,cycle_momentum=False) + self.evalstep = 50 def set_seed(self, seed): @@ -464,7 +519,7 @@ def read_file(mode='train'): return dataset - epochs = 1000 + epochs = 3500 bestloss = float('inf') trainingdata = read_file() @@ -508,7 +563,7 @@ def read_file(mode='train'): mtlloss = posloss + sbdloss # TODO: learnable weights? mtlloss.backward() self.optimizer.step() - self.scheduler.step() # TODO: Multi-step LR annealing seems to increase sentence splitting performance. Need a best annealing strategy + self.scheduler.step() if old_batchsize != self.mtlmodel.batch_size: self.mtlmodel.batch_size = old_batchsize @@ -567,8 +622,10 @@ def read_file(mode='train'): # get the sbd loss sbdlogits = sbdlogits.permute(0,2,1) sbdtags = torch.LongTensor(sbdlabels).to(self.device) + sbddevloss = self.sbdloss(sbdlogits, sbdtags).item() + # get the pos loss postags = torch.LongTensor(poslabels) postags = postags.to(self.device) @@ -641,7 +698,6 @@ def read_file(mode='train'): def predict(self,toks,checkpointfile=None): - def is_tok(sgml_line): return len(sgml_line) > 0 and not (sgml_line.startswith("<") and sgml_line.endswith(">")) @@ -656,24 +712,49 @@ def unescape(token): slices = [] toks = unescape(toks) # Splitter is trained on UTF-8 forms, since LM embeddings know characters like '&' lines = toks.strip().split("\n") + + # add super token tags + supertokenlabels = [] + for i in range(0,len(lines)): + if i > 0: + prevtoken = lines[i-1] + if i < len(lines) - 1: + nexttoken = lines[i + 1] + + currtoken = lines[i] + + if is_tok(currtoken): + if not is_tok(prevtoken): + if not is_tok(nexttoken): + supertokenlabels.append("O") + else: + supertokenlabels.append("B") + else: + if not is_tok(nexttoken): + supertokenlabels.append("E") + else: + supertokenlabels.append("I") + toks = [l for l in lines if is_tok(l)] toks = [re.sub(r"\t.*", "", t) for t in toks] + assert len(toks) == len(supertokenlabels) + # slice up the token list into slices of seqlen for GPU RAM reasons for idx in range(0, len(toks), self.mtlmodel.sequence_length): if idx + self.mtlmodel.sequence_length >= len(toks): slice = toks[idx:len(toks)] + supertokenslice = supertokenlabels[idx:len(toks)] else: slice = toks[idx: idx + self.mtlmodel.sequence_length] + supertokenslice = supertokenlabels[idx: idx + self.mtlmodel.sequence_length] - slices.append(slice) + slices.append((slice,supertokenslice)) - test = [d for slice in slices for d in slice] + test = [d for s in slices for d in s[0]] assert len(test) == len(toks) - - if checkpointfile is not None: checkpoint = torch.load(checkpointfile) @@ -690,11 +771,11 @@ def unescape(token): for slice in slices: - if len(slice) != self.mtlmodel.sequence_length: # this will happen in one case, for the last slice in the batch - self.mtlmodel.sequence_length = len(slice) + if len(slice[0]) != self.mtlmodel.sequence_length: # this will happen in one case, for the last slice in the batch + self.mtlmodel.sequence_length = len(slice[0]) # Flair CRF decoding uses the Sentence object.. 
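As the comment notes, the Flair Viterbi decoder expects a Sentence object; because each slice is already tokenized, it is joined on spaces and wrapped without re-tokenization. A minimal sketch (the tokens are made up):

    from flair.data import Sentence

    toks = ['גנן', 'גידל', 'דגן', 'בגן']                  # hypothetical pre-tokenized slice
    sent = Sentence(' '.join(toks), use_tokenizer=False)  # whitespace split only; keeps the given tokens
    assert len(sent) == len(toks)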
- sentence = ' '.join([s.split('\t')[0].strip() for s in slice]) + sentence = ' '.join([s.split('\t')[0].strip() for s in slice[0]]) sentence = Sentence(sentence, use_tokenizer=False) _, _, sbdpreds, poslogits, _ = self.mtlmodel(slice, mode='test') @@ -725,15 +806,36 @@ def write_file(filename,mode='train'): data = traindata with open(filename,'w') as tr: + length = -1 for sent in data: - for i in range(0,len(sent)): - if isinstance(sent[i]['id'], tuple): continue # MWE conventions in the conllu file + i = 0 + while i < len(sent): + if isinstance(sent[i]['id'], tuple): + # fetch the super token tag + supertoken = 'B' + length = sent[i]['id'][-1] - sent[i]['id'][0] + start = sent[i]['id'][0] + i += 1 + continue + elif length > 0 and supertoken in ('B','I'): + if sent[i]['id'] == start: + supertoken = 'B' + else: + supertoken = 'I' + length -=1 + elif length == 0: + supertoken = 'E' + length = -1 + elif length == -1: + supertoken = 'O' if sent[i]['id'] == 1: - tr.write(sent[i]['form'] + '\t' + sent[i]['upos'] + '\t' + 'B-SENT' + '\n') + tr.write(sent[i]['form'] + '\t' + sent[i]['upos'] + '\t' + 'B-SENT' + '\t' + supertoken + '\n') else: - tr.write(sent[i]['form'] + '\t' + sent[i]['upos'] + '\t' + 'O' + '\n') + tr.write(sent[i]['form'] + '\t' + sent[i]['upos'] + '\t' + 'O' + '\t' + supertoken + '\n') + + i += 1 traindata = self.read_conllu() devdata = self.read_conllu(mode='dev') @@ -1030,16 +1132,54 @@ def spans_score(self, gold_spans, system_spans): def main(): # testing only - iahltwikitrain = '/home/nitin/Desktop/IAHLT/UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-train.conllu' - iahltwikidev = '/home/nitin/Desktop/IAHLT/UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-dev.conllu' - - htbdev = '/home/nitin/Desktop/htb/UD_Hebrew/he_htb-ud-dev.conllu' - htbtrain = '/home/nitin/Desktop/htb/UD_Hebrew/he_htb-ud-train.conllu' - - tagger = Tagger(trainflag=True,trainfile=iahltwikitrain,devfile=iahltwikidev) - #tagger = Tagger(trainflag=True,trainfile=htbtrain,devfile=htbdev) + parser = argparse.ArgumentParser() + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--seqlen', type=int, default=192) + parser.add_argument('--trainbatch', type=int, default=16) + parser.add_argument('--datatype', type=str, default='wiki') + parser.add_argument('--sbdrnndim', type=int, default=256) + parser.add_argument('--posrnndim', type=int, default=512) + parser.add_argument('--sbdfflayerdim', type=int, default=256) + parser.add_argument('--posfflayerdim', type=int, default=512) + parser.add_argument('--posrnnbidirectional', type=bool, default=True) + parser.add_argument('--sbdrnnbidirectional', type=bool, default=True) + parser.add_argument('--posrnnnumlayers', type=int, default=1) + parser.add_argument('--sbdrnnnumlayers', type=int, default=1) + parser.add_argument('--sbdencodertype', type=str, default='lstm') + parser.add_argument('--posencodertype', type=str, default='lstm') + parser.add_argument('--dropout', type=float, default=0.05) + parser.add_argument('--worddropout', type=float, default=0.05) + parser.add_argument('--lockeddropout', type=float, default=0.5) + + + args = parser.parse_args() + + iahltwikitrain = '../he_iahltwiki-ud-train.conllu' + iahltwikidev = '../he_iahltwiki-ud-dev.conllu' + + htbdev = '../he_htb-ud-dev.conllu' + htbtrain = '../he_htb-ud-train.conllu' + + if args.datatype == 'wiki': + tagger = Tagger(trainflag=True, trainfile=iahltwikitrain, devfile=iahltwikidev, sbdrnndim=args.sbdrnndim, sbdfflayerdim=args.sbdfflayerdim, + posrnndim=args.posrnndim, posfflayerdim=args.posfflayerdim, 
sbdrnnbidirectional=args.sbdrnnbidirectional, + posrnnbidirectional=args.posrnnbidirectional, sbdrnnnumlayers=args.sbdrnnnumlayers, + posrnnnumlayers=args.posrnnnumlayers, sbdencodertype=args.sbdencodertype, + posencodertype=args.posencodertype + , learningrate=args.lr, batchsize=args.trainbatch, sequencelength=args.seqlen, + dropout=args.dropout, wordropout=args.worddropout, lockeddropout=args.lockeddropout,datatype=args.datatype) + + else: + tagger = Tagger(trainflag=True, trainfile=htbtrain, devfile=htbdev, sbdrnndim=args.sbdrnndim,sbdfflayerdim=args.sbdfflayerdim, + posrnndim=args.posrnndim, posfflayerdim=args.posfflayerdim,sbdrnnbidirectional=args.sbdrnnbidirectional, + posrnnbidirectional=args.posrnnbidirectional, sbdrnnnumlayers=args.sbdrnnnumlayers, + posrnnnumlayers=args.posrnnnumlayers, sbdencodertype=args.sbdencodertype, + posencodertype=args.posencodertype + , learningrate=args.lr, batchsize=args.trainbatch, sequencelength=args.seqlen, + dropout=args.dropout, wordropout=args.worddropout, lockeddropout=args.lockeddropout,datatype=args.datatype) tagger.prepare_data_files() + #tagger.train(checkpointfile='/home/nitin/Desktop/hebpipe/HebPipe/hebpipe/data/checkpoint/htb_best_sent_pos_model_13.316283_0.979424_0.98009.pt') tagger.train() From c76bc6eb6d586996b33dd2512882f4bbf825a9d8 Mon Sep 17 00:00:00 2001 From: nitin Date: Thu, 4 Aug 2022 03:04:06 +0800 Subject: [PATCH 16/32] bugfix --- hebpipe/heb_pipe.py | 1 + hebpipe/lib/crfutils/viterbi.py | 22 +++++++++++-------- .../lib/multitask_sentsplitter_postagger.py | 6 +---- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index c05dfc1..d22e487 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -603,6 +603,7 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, # Wiki mtltagger = Tagger(trainflag=False,bestmodelpath='data/checkpoint/',sequencelength=256,sbdrnndim=512,posrnndim=512,sbdfflayerdim=512) + # HTB #mtltagger = Tagger(trainflag=False,bestmodelpath='data/checkpoint/',sequencelength=320,sbdrnndim=256,posrnndim=512,sbdfflayerdim=256) diff --git a/hebpipe/lib/crfutils/viterbi.py b/hebpipe/lib/crfutils/viterbi.py index 75f7f56..858f845 100644 --- a/hebpipe/lib/crfutils/viterbi.py +++ b/hebpipe/lib/crfutils/viterbi.py @@ -5,8 +5,6 @@ import torch.nn from torch.nn.functional import softmax from torch.nn.utils.rnn import pack_padded_sequence - -import flair from flair.data import Dictionary, Label, List, Sentence START_TAG: str = "" @@ -28,6 +26,9 @@ def __init__(self, tag_dictionary: Dictionary): self.start_tag = tag_dictionary.get_idx_for_item(START_TAG) self.stop_tag = tag_dictionary.get_idx_for_item(STOP_TAG) + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + def forward(self, features_tuple: tuple, targets: torch.Tensor) -> torch.Tensor: """ Forward propagation of Viterbi Loss @@ -42,7 +43,7 @@ def forward(self, features_tuple: tuple, targets: torch.Tensor) -> torch.Tensor: seq_len = features.size(1) targets, targets_matrix_indices = self._format_targets(targets, lengths) - targets_matrix_indices = torch.tensor(targets_matrix_indices, dtype=torch.long).unsqueeze(2).to(flair.device) + targets_matrix_indices = torch.tensor(targets_matrix_indices, dtype=torch.long).unsqueeze(2).to(self.device) # scores_at_targets[range(features.shape[0]), lengths.values -1] # Squeeze crf scores matrices in 1-dim shape and gather scores at targets by matrix indices @@ -54,7 +55,7 @@ def forward(self, features_tuple: tuple, 
targets: torch.Tensor) -> torch.Tensor: ] gold_score = scores_at_targets.sum() + transitions_to_stop.sum() - scores_upto_t = torch.zeros(batch_size, self.tagset_size, device=flair.device) + scores_upto_t = torch.zeros(batch_size, self.tagset_size, device=self.device) for t in range(max(lengths)): batch_size_t = sum( @@ -137,6 +138,9 @@ def __init__(self, tag_dictionary: Dictionary): self.start_tag = tag_dictionary.get_idx_for_item(START_TAG) self.stop_tag = tag_dictionary.get_idx_for_item(STOP_TAG) + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + def decode( self, features_tuple: tuple, probabilities_for_all_classes: bool, sentences: List[Sentence] ) -> Tuple[List, List]: @@ -154,12 +158,12 @@ def decode( seq_len = features.size(1) # Create a tensor to hold accumulated sequence scores at each current tag - scores_upto_t = torch.zeros(batch_size, seq_len + 1, self.tagset_size).to(flair.device) + scores_upto_t = torch.zeros(batch_size, seq_len + 1, self.tagset_size).to(self.device) # Create a tensor to hold back-pointers # i.e., indices of the previous_tag that corresponds to maximum accumulated score at current tag # Let pads be the tag index, since that was the last tag in the decoded sequence backpointers = ( - torch.ones((batch_size, seq_len + 1, self.tagset_size), dtype=torch.long, device=flair.device) + torch.ones((batch_size, seq_len + 1, self.tagset_size), dtype=torch.long, device=self.device) * self.stop_tag ) @@ -186,8 +190,8 @@ def decode( ) # Decode/trace best path backwards - decoded = torch.zeros((batch_size, backpointers.size(1)), dtype=torch.long, device=flair.device) - pointer = torch.ones((batch_size, 1), dtype=torch.long, device=flair.device) * self.stop_tag + decoded = torch.zeros((batch_size, backpointers.size(1)), dtype=torch.long, device=self.device) + pointer = torch.ones((batch_size, 1), dtype=torch.long, device=self.device) * self.stop_tag for t in list(reversed(range(backpointers.size(1)))): decoded[:, t] = torch.gather(backpointers[:, t, :], 1, pointer).squeeze(1) @@ -195,7 +199,7 @@ def decode( # Sanity check assert torch.equal( - decoded[:, 0], torch.ones((batch_size), dtype=torch.long, device=flair.device) * self.start_tag + decoded[:, 0], torch.ones((batch_size), dtype=torch.long, device=self.device) * self.start_tag ) # remove start-tag and backscore to stop-tag diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/multitask_sentsplitter_postagger.py index e11436d..b2c3ee7 100644 --- a/hebpipe/lib/multitask_sentsplitter_postagger.py +++ b/hebpipe/lib/multitask_sentsplitter_postagger.py @@ -166,12 +166,11 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= elif 'weight' in name: nn.init.xavier_normal_(param) - #self.sigmoid = nn.Sigmoid() self.dropout = nn.Dropout(dropout) self.worddropout = WordDropout(wordropout) self.lockeddropout = LockedDropout(lockeddropout) - self.poscrf = CRF(self.postagsetcrf,len(self.postagsetcrf),init_from_state_dict=False) # TODO: parameterize + self.poscrf = CRF(self.postagsetcrf,len(self.postagsetcrf),init_from_state_dict=False).to(self.device) # TODO: parameterize self.viterbidecoder = ViterbiDecoder(self.postagsetcrf) for name, param in self.poscrf.named_parameters(): @@ -294,9 +293,6 @@ def forward(self,data,mode='train'): embeddings = torch.div(scalarsum,self.lastn) #embeddings = embeddings.to(self.device) - #embeddings = embeddings[0] - #embeddings = embeddings.to(self.device) - """ Average the subword embeddings within the horizontal 
sequence. This process will drop the [CLS],[SEP] and [PAD] tokens From 51dbb359155740ed0a92fa995ebc15b3227347af Mon Sep 17 00:00:00 2001 From: nitin Date: Mon, 8 Aug 2022 23:42:15 +0800 Subject: [PATCH 17/32] sbd plus pos. branching --- hebpipe/heb_pipe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index d22e487..195efa7 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -602,10 +602,10 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, input_data = input_data.replace("|","") # Wiki - mtltagger = Tagger(trainflag=False,bestmodelpath='data/checkpoint/',sequencelength=256,sbdrnndim=512,posrnndim=512,sbdfflayerdim=512) + mtltagger = Tagger(trainflag=False,bestmodelpath='data/checkpoint/',sequencelength=256,sbdrnndim=256,posrnndim=512,sbdfflayerdim=256) # HTB - #mtltagger = Tagger(trainflag=False,bestmodelpath='data/checkpoint/',sequencelength=320,sbdrnndim=256,posrnndim=512,sbdfflayerdim=256) + #mtltagger = Tagger(trainflag=False,bestmodelpath='data/checkpoint/',sequencelength=256,sbdrnndim=256,posrnndim=512,sbdfflayerdim=256) if preloaded is not None: @@ -678,7 +678,7 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, """ sent_tag = 's' - tagged_conllu, tokenized = mtltagger.split_pos(tokenized,checkpointfile='/home/nitin/Desktop/hebpipe/HebPipe/hebpipe/data/checkpoint/top_wiki_best_sent_pos_model_17.477738_0.857963_0.972323.pt') + tagged_conllu, tokenized = mtltagger.split_pos(tokenized,checkpointfile='/home/nitin/Desktop/hebpipe/HebPipe/hebpipe/data/checkpoint/top_wiki_best_sent_pos_model_18.45584_0.883117_0.971578.pt') pos_tags = [l.split("\t")[3] for l in tagged_conllu.split("\n") if "\t" in l] del mtltagger From a7847b2f0b76694105d2f8b5ee6d81a13e5e953e Mon Sep 17 00:00:00 2001 From: nitin Date: Tue, 9 Aug 2022 00:24:07 +0800 Subject: [PATCH 18/32] rename --- hebpipe/lib/{multitask_sentsplitter_postagger.py => mtlmodel.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename hebpipe/lib/{multitask_sentsplitter_postagger.py => mtlmodel.py} (100%) diff --git a/hebpipe/lib/multitask_sentsplitter_postagger.py b/hebpipe/lib/mtlmodel.py similarity index 100% rename from hebpipe/lib/multitask_sentsplitter_postagger.py rename to hebpipe/lib/mtlmodel.py From 0bad4d0767320025c3d86c8bf13679d6c9893024 Mon Sep 17 00:00:00 2001 From: nitin Date: Tue, 16 Aug 2022 02:24:17 +0800 Subject: [PATCH 19/32] feats --- hebpipe/heb_pipe.py | 4 +- hebpipe/lib/mtlmodel.py | 208 ++++++++++++++++++++++++++++++++-------- 2 files changed, 168 insertions(+), 44 deletions(-) diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index 195efa7..85f621c 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -27,7 +27,7 @@ from .lib.whitespace_tokenize import add_space_after, tokenize as whitespace_tokenize from .lib.flair_sent_splitter import FlairSentSplitter from .lib.flair_pos_tagger import FlairTagger - from .lib.multitask_sentsplitter_postagger import Tagger + from .lib.mtlmodel import Tagger except ImportError: # direct script usage from lib.xrenner import Xrenner from lib._version import __version__ @@ -37,7 +37,7 @@ from lib.whitespace_tokenize import add_space_after, tokenize as whitespace_tokenize from lib.flair_sent_splitter import FlairSentSplitter from lib.flair_pos_tagger import FlairTagger - from lib.multitask_sentsplitter_postagger import Tagger + from lib.mtlmodel import Tagger PY3 = sys.version_info[0] > 2 diff --git a/hebpipe/lib/mtlmodel.py 
b/hebpipe/lib/mtlmodel.py index b2c3ee7..74ccc25 100644 --- a/hebpipe/lib/mtlmodel.py +++ b/hebpipe/lib/mtlmodel.py @@ -43,7 +43,7 @@ def __init__(self, start, end): class MTLModel(nn.Module): - def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers=1,posfflayerdim=512,sbdrnnbidirectional=True,posrnnbidirectional=True,sbdencodertype='lstm',sbdfflayerdim=256,posencodertype='lstm',batchsize=16,sequencelength=256,dropout=0.0,wordropout=0.05,lockeddropout=0.5): + def __init__(self,sbdrnndim=128,posmorphrnndim=256,sbdrnnnumlayers=1,posmorphrnnnumlayers=1,posmorphfflayerdim=512,sbdrnnbidirectional=True,posmorphrnnbidirectional=True,sbdencodertype='lstm',sbdfflayerdim=256,posmorphencodertype='lstm',batchsize=16,sequencelength=256,dropout=0.0,wordropout=0.05,lockeddropout=0.5): super(MTLModel,self).__init__() self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -60,6 +60,18 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= self.postagsetcrf.add_item("") self.postagsetcrf.add_item("") + # FEATS dictionary + # IMPORTANT: This should be sorted by key + self.featstagset = {'Abbr=Yes':0, 'Aspect=Prog':1, 'Case=Acc':2, 'Case=Gen':3, 'Definite=Com':4, 'Definite=Cons':5, 'Definite=Def':6, 'Definite=Ind':7, 'Definite=Spec':8, + 'Foreign=Yes':9, 'Gender=Fem':10, 'Gender=Masc':11, 'HebBinyan=HIFIL':12, 'HebBinyan=HITPAEL':13, 'HebBinyan=HUFAL':14, 'HebBinyan=NIFAL':15, + 'HebBinyan=NITPAEL':16, 'HebBinyan=PAAL':17, 'HebBinyan=PIEL':18, 'HebBinyan=PUAL':19, 'Mood=Imp':20, + 'NumType=Card':21, 'NumType=Ord':22, 'Number=Dual':23, 'Number=Plur':24, 'Number=Sing':25, 'Person=1':26, + 'Person=2':27, 'Person=3':28, 'Polarity=Neg':29, 'Polarity=Pos':30, 'Poss=Yes':31, 'Prefix=Yes':32, 'PronType=Art':33, 'PronType=Dem':34, + 'PronType=Emp':35, 'PronType=Ind':36, 'PronType=Int':37, 'PronType=Prs':38, 'Reflex=Yes':39, 'Tense=Fut':40, 'Tense=Past':41, 'Tense=Pres':42, + 'Typo=Yes':43, 'VerbForm=Inf':44, 'VerbForm=Part':45, 'VerbType=Cop':46, 'VerbType=Mod':47, 'Voice=Act':48, 'Voice=Mid':49, 'Voice=Pass':50} + + self.idxtofeatstagset = {v:k for k,v in self.featstagset.items()} + # shared hyper-parameters self.sequence_length = sequencelength self.batch_size = batchsize @@ -80,9 +92,9 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= self.sbdrnnbidirectional = sbdrnnbidirectional #Bi-LSTM Encoder for POS tagging - self.posrnndim = posrnndim - self.posrnnnumlayers = posrnnnumlayers - self.posrnnbidirectional = posrnnbidirectional + self.posmorphrnndim = posmorphrnndim + self.posmorphrnnnumlayers = posmorphrnnnumlayers + self.posmorphrnnbidirectional = posmorphrnnbidirectional if sbdencodertype == 'lstm': self.sbdencoder = nn.LSTM(input_size=self.embeddingdim, hidden_size=self.sbdrnndim // 2, @@ -103,17 +115,17 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= except ValueError as ex: nn.init.constant_(param,0.0) - if posencodertype == 'lstm': - self.posencoder = nn.LSTM(input_size=self.embeddingdim + 5, hidden_size=self.posrnndim // 2, - num_layers=self.posrnnnumlayers, bidirectional=self.posrnnbidirectional, + if posmorphencodertype == 'lstm': + self.posmorphencoder = nn.LSTM(input_size=self.embeddingdim + 5, hidden_size=self.posmorphrnndim // 2, + num_layers=self.posmorphrnnnumlayers, bidirectional=self.posmorphrnnbidirectional, batch_first=True).to(self.device) - elif posencodertype == 'gru': - self.posencoder = nn.GRU(input_size=self.embeddingdim + 5, 
hidden_size=self.posrnndim // 2, - num_layers=self.posrnnnumlayers, bidirectional=self.posrnnbidirectional, + elif posmorphencodertype == 'gru': + self.posmorphencoder = nn.GRU(input_size=self.embeddingdim + 5, hidden_size=self.posmorphrnndim // 2, + num_layers=self.posmorphrnnnumlayers, bidirectional=self.posmorphrnnbidirectional, batch_first=True).to(self.device) # param init - for name, param in self.posencoder.named_parameters(): + for name, param in self.posmorphencoder.named_parameters(): try: if 'bias' in name: nn.init.constant_(param, 0.0) @@ -137,19 +149,19 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= nn.init.xavier_normal_(param) # Intermediate feedforward layer - self.posembedding2nn = nn.Linear(in_features=self.embeddingdim + 5,out_features=self.embeddingdim + 5).to(self.device) - self.posfflayerdim = posfflayerdim - self.posfflayer = nn.Linear(in_features=self.posrnndim, out_features=self.posfflayerdim).to(self.device) + self.posmorphembedding2nn = nn.Linear(in_features=self.embeddingdim + 5,out_features=self.embeddingdim + 5).to(self.device) + self.posmorphfflayerdim = posmorphfflayerdim + self.posmorphfflayer = nn.Linear(in_features=self.posmorphrnndim, out_features=self.posmorphfflayerdim).to(self.device) # param init - for name, param in self.posembedding2nn.named_parameters(): + for name, param in self.posmorphembedding2nn.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_normal_(param) # Label space for the pos tagger - self.hidden2postag = nn.Linear(in_features=self.posfflayerdim,out_features=len(self.postagsetcrf)).to(self.device) + self.hidden2postag = nn.Linear(in_features=self.posmorphfflayerdim,out_features=len(self.postagsetcrf)).to(self.device) for name, param in self.hidden2postag.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) @@ -158,7 +170,6 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= # Label space for sent splitter self.hidden2sbd = nn.Linear(in_features=self.sbdfflayerdim,out_features=len(self.sbd_tag2idx.keys())).to(self.device) - # param init for name, param in self.hidden2sbd.named_parameters(): if 'bias' in name: @@ -166,6 +177,15 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= elif 'weight' in name: nn.init.xavier_normal_(param) + + # label space for morph feats + self.hidden2feats = nn.Linear(in_features=self.posmorphfflayerdim,out_features=len(self.featstagset)).to(self.device) + for name, param in self.hidden2feats.named_parameters(): + if 'bias' in name: + nn.init.constant_(param, 0.0) + elif 'weight' in name: + nn.init.xavier_normal_(param) + self.dropout = nn.Dropout(dropout) self.worddropout = WordDropout(wordropout) self.lockeddropout = LockedDropout(lockeddropout) @@ -181,7 +201,9 @@ def __init__(self,sbdrnndim=128,posrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers= self.stride_size = 10 self.sbdencodertype = sbdencodertype - self.posencodertype = posencodertype + self.posmorphencodertype = posmorphencodertype + + self.sigmoid = nn.Sigmoid() def shingle(self,toks,labels=None): """ @@ -238,6 +260,7 @@ def shingle(self,toks,labels=None): def forward(self,data,mode='train'): badrecords = [] # stores records where AlephBERT's tokenization 'messed up' the sentence's sequence length, and removes these sentences from the batch. 
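The added lines that follow build a multi-hot FEATS target per token from the pipe-separated UD FEATS column, using the featstagset index defined above and splitting comma-separated values such as Gender=Fem,Masc. A minimal standalone sketch of that encoding, not part of the patch itself; the tagset here is a small illustrative subset of the real, key-sorted featstagset:

    # Illustrative sketch only; the real featstagset is larger and sorted by key.
    featstagset = {'Definite=Def': 0, 'Gender=Fem': 1, 'Gender=Masc': 2,
                   'Number=Plur': 3, 'Number=Sing': 4, 'Tense=Past': 5}

    def encode_feats(feats_str):
        """Turn a UD FEATS string such as 'Gender=Fem,Masc|Number=Sing' into a multi-hot vector."""
        vec = [0] * len(featstagset)
        if feats_str in ('', '_'):          # empty FEATS column
            return vec
        for pair in feats_str.split('|'):
            key, value = pair.split('=')
            for v in value.split(','):      # comma-separated multi-values, e.g. Gender=Fem,Masc
                label = key + '=' + v
                if label in featstagset:    # features outside the tagset are skipped
                    vec[featstagset[label]] = 1
        return vec

    assert encode_feats('Gender=Fem,Masc|Number=Sing') == [0, 1, 1, 0, 1, 0]
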
+ featslabels = None # Extract the sentences and labels if mode == 'train': # training is on a batch, so 3D tensor @@ -246,13 +269,34 @@ def forward(self,data,mode='train'): poslabels = [[self.postagsetcrf.get_idx_for_item(s.split('\t')[1].strip()) for s in sls] for sls in data] supertokenlabels = [] + featslabels = [] + for sls in data: record = [] + featsrecord = [] for s in sls: temp = [0] * len(self.supertokenset) - temp[self.supertokenset[s.split('\t')[-1].strip()]] = 1 + temp[self.supertokenset[s.split('\t')[3].strip()]] = 1 record.append(temp) + + tempfeats = [0] * len(self.featstagset) + fts = s.split('\t')[-1].strip() + if fts != '': + fts = fts.split('|') + for f in fts: + key = f.split('=')[0] + value = f.split('=')[1] + if ',' not in value: + tempfeats[self.featstagset[f]] = 1 + else: + value = value.split(',') + for v in value: + tempfeats[self.featstagset[key + '=' + v]] = 1 + + featsrecord.append(tempfeats) + supertokenlabels.append(record) + featslabels.append(featsrecord) elif mode == 'dev': # inference is on a single record sentences = [' '.join([s.split('\t')[0].strip() for s in data])] @@ -260,11 +304,28 @@ def forward(self,data,mode='train'): poslabels = [self.postagsetcrf.get_idx_for_item(s.split('\t')[1].strip()) for s in data] supertokenlabels = [] + featslabels = [] for s in data: temp = [0] * len(self.supertokenset) - temp[self.supertokenset[s.split('\t')[-1].strip()]] = 1 + temp[self.supertokenset[s.split('\t')[3].strip()]] = 1 supertokenlabels.append(temp) + tempfeats = [0] * len(self.featstagset) + fts = s.split('\t')[-1].strip() + if fts != '': + fts = fts.split('|') + for f in fts: + key = f.split('=')[0] + value = f.split('=')[1] + if ',' not in value: + tempfeats[self.featstagset[f]] = 1 + else: + value = value.split(',') + for v in value: + tempfeats[self.featstagset[key + '=' + v]] = 1 + + featslabels.append(tempfeats) + else: # test - a tuple of text and supertoken labels sentences = [' '.join([s.split('\t')[0].strip() for s in data[0]])] @@ -399,7 +460,7 @@ def forward(self,data,mode='train'): supertokenlabels = torch.unsqueeze(supertokenlabels,dim=0) # Add the SBD predictions to the POS Encoder Input! 
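The concatenation just below conditions the POS encoder on the upstream sentence-boundary predictions and the super-token one-hots by stacking them onto the averaged subword embeddings along the feature dimension; this is also why the encoder input size carries the extra columns beyond embeddingdim. A rough shape-level sketch of that conditioning, with made-up sizes (batch, sequence length, embedding width, and number of super-token classes are all illustrative):

    import torch

    batch, seqlen, embdim, n_supertok = 2, 8, 768, 4   # made-up sizes for illustration

    avgembeddings = torch.randn(batch, seqlen, embdim)                  # averaged subword embeddings
    sbdpreds = torch.randint(0, 2, (batch, seqlen, 1)).float()          # 0/1 sentence-break predictions
    supertokenlabels = torch.nn.functional.one_hot(
        torch.randint(0, n_supertok, (batch, seqlen)), n_supertok).float()

    # Condition the POS encoder on the upstream predictions by concatenating
    # them to the embeddings along the feature dimension (dim=2).
    posembeddings = torch.cat((avgembeddings, sbdpreds, supertokenlabels), dim=2)
    assert posembeddings.shape == (batch, seqlen, embdim + 1 + n_supertok)
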
- posembeddings = torch.cat((avgembeddings,sbdpreds,supertokenlabels),dim=2) + posmorphembeddings = torch.cat((avgembeddings,sbdpreds,supertokenlabels),dim=2) if mode in ('dev','test'): # Squeeze these to return to the Trainer for scores, now that we are done with them @@ -409,13 +470,13 @@ def forward(self,data,mode='train'): else: sbdpreds = None - posembeddings = self.dropout(posembeddings) - posembeddings = self.worddropout(posembeddings) - posembeddings = self.lockeddropout(posembeddings) - posembeddings = self.posembedding2nn(posembeddings) + posmorphembeddings = self.dropout(posmorphembeddings) + posmorphembeddings = self.worddropout(posmorphembeddings) + posmorphembeddings = self.lockeddropout(posmorphembeddings) + posmorphembeddings = self.posmorphembedding2nn(posmorphembeddings) - feats,_ = self.posencoder(posembeddings) - feats = self.posfflayer(feats) + feats,_ = self.posmorphencoder(posmorphembeddings) + feats = self.posmorphfflayer(feats) feats = self.relu(feats) feats = self.dropout(feats) feats = self.lockeddropout(feats) @@ -424,12 +485,15 @@ def forward(self,data,mode='train'): poslogits = self.hidden2postag(feats) poslogits = self.poscrf(poslogits) - return sbdlogits, finalsbdlabels, sbdpreds, poslogits, poslabels # returns the logits and labels + # logits for morphs + featslogits = self.hidden2feats(feats) + + return sbdlogits, finalsbdlabels, sbdpreds, poslogits, poslabels, featslogits,featslabels # returns the logits and labels class Tagger(): def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=128,sbdrnnnumlayers=1,sbdrnnbidirectional=True,sbdfflayerdim=256,posrnndim=256,posrnnnumlayers=1,posrnnbidirectional=True,posfflayerdim=512,dropout=0.05,wordropout=0.05,lockeddropout=0.5,sbdencodertype='lstm',posencodertype='lstm',learningrate = 0.001,bestmodelpath='../data/checkpoint/',batchsize=32,sequencelength=256,datatype='htb'): - self.mtlmodel = MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdencodertype=sbdencodertype,sbdfflayerdim=sbdfflayerdim,dropout=dropout,wordropout=wordropout,lockeddropout=lockeddropout,posrnndim=posrnndim,posrnnbidirectional=posrnnbidirectional,posencodertype=posencodertype,posrnnnumlayers=posrnnnumlayers,posfflayerdim=posfflayerdim,batchsize=batchsize,sequencelength=sequencelength) + self.mtlmodel = MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdencodertype=sbdencodertype,sbdfflayerdim=sbdfflayerdim,dropout=dropout,wordropout=wordropout,lockeddropout=lockeddropout,posmorphrnndim=posrnndim,posmorphrnnbidirectional=posrnnbidirectional,posmorphencodertype=posencodertype,posmorphrnnnumlayers=posrnnnumlayers,posmorphfflayerdim=posfflayerdim,batchsize=batchsize,sequencelength=sequencelength) if trainflag == True: @@ -465,12 +529,18 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdr self.sbdloss = nn.CrossEntropyLoss(weight=torch.FloatTensor([1,3])) self.sbdloss.to(self.device) + self.featsloss = nn.BCEWithLogitsLoss() + self.featsloss.to(self.device) + self.optimizer = torch.optim.AdamW(list(self.mtlmodel.sbdencoder.parameters()) + list(self.mtlmodel.sbdembedding2nn.parameters()) + - list(self.mtlmodel.hidden2sbd.parameters()) + list(self.mtlmodel.posencoder.parameters()) + list(self.mtlmodel.posembedding2nn.parameters()) - + list(self.mtlmodel.hidden2postag.parameters()) + list(self.mtlmodel.poscrf.parameters()), lr=learningrate) + 
list(self.mtlmodel.hidden2sbd.parameters()) + list(self.mtlmodel.posmorphencoder.parameters()) + list(self.mtlmodel.posmorphembedding2nn.parameters()) + + list(self.mtlmodel.hidden2postag.parameters()) + list(self.mtlmodel.poscrf.parameters()) + + list(self.mtlmodel.hidden2feats.parameters()), lr=learningrate) self.scheduler = torch.optim.lr_scheduler.CyclicLR(self.optimizer,base_lr=learningrate/10,max_lr=learningrate,step_size_up=250,cycle_momentum=False) - self.evalstep = 50 + self.evalstep = 1 + + self.sigmoidthreshold = 0.5 def set_seed(self, seed): @@ -540,7 +610,7 @@ def read_file(mode='train'): data = [datum for datum in data if len(datum) == self.mtlmodel.sequence_length] self.mtlmodel.batch_size = len(data) - sbdlogits, sbdlabels, _, poslogits,poslabels = self.mtlmodel(data) + sbdlogits, sbdlabels, _, poslogits,poslabels, featslogits, featslabels = self.mtlmodel(data) sbdtags = torch.LongTensor(sbdlabels).to(self.device) sbdlogits = sbdlogits.permute(0,2,1) @@ -555,8 +625,10 @@ def read_file(mode='train'): postags = torch.LongTensor(postags).to(self.device) posloss = self.postagloss(scores,postags) + featstags = torch.FloatTensor(featslabels).to(self.device) + featsloss = self.featsloss(featslogits,featstags) - mtlloss = posloss + sbdloss # TODO: learnable weights? + mtlloss = posloss + sbdloss + featsloss # TODO: learnable weights? mtlloss.backward() self.optimizer.step() self.scheduler.step() @@ -566,6 +638,7 @@ def read_file(mode='train'): self.writer.add_scalar('train_pos_loss', posloss.item(), epoch) self.writer.add_scalar('train_sbd_loss', sbdloss.item(), epoch) + self.writer.add_scalar('train_feats_loss', featsloss.item(), epoch) self.writer.add_scalar('train_joint_loss', mtlloss.item(), epoch) """"""""""""""""""""""""""""""""""""""""""""" @@ -579,13 +652,14 @@ def read_file(mode='train'): totalsbddevloss = 0 totalposdevloss = 0 + totalfeatsdevloss = 0 allsbdpreds = [] allsbdgold = [] allpospreds = [] allposgold = [] - - start = time() + allfeatsgold = [] + allfeatspreds = [] # because of shingling for SBD, the dev data needs to be split in slices for inference, as GPU may run out of memory with shingles on the full token list. # shingling and SBD prediction is done on the individual slice, as well as POS tag predictions. @@ -604,9 +678,10 @@ def read_file(mode='train'): goldsbdlabels = [self.mtlmodel.sbd_tag2idx[s] for s in goldsbdlabels] goldposlabels = [s.split('\t')[1].strip() for s in slice] goldposlabels = [self.mtlmodel.postagsetcrf.get_idx_for_item(s) for s in goldposlabels] + goldfeatslabels = [s.split('\t')[4].strip() for s in slice] # sbdpreds already contains the sbd predictions. These were necessary for input to the POS encoder. 
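The feats head introduced in this patch is treated as a multi-label problem: it is trained with BCEWithLogitsLoss against the multi-hot targets and folded into the joint MTL loss, and in the dev loop that follows its logits are passed through a sigmoid and cut at sigmoidthreshold. A minimal sketch of that train/eval pattern, with toy shapes that stand in for len(featstagset) and the sequence length:

    import torch
    import torch.nn as nn

    n_feats, seqlen = 26, 5                                          # toy sizes for illustration
    featslogits = torch.randn(1, seqlen, n_feats)                    # raw logits from the feats head
    featstags = torch.randint(0, 2, (1, seqlen, n_feats)).float()    # multi-hot gold labels

    # Training: BCE-with-logits over every (token, feature) cell, added to the joint MTL loss.
    featsloss = nn.BCEWithLogitsLoss()(featslogits, featstags)

    # Evaluation: independent sigmoid per feature, thresholded (0.5 here) to pick the active feats.
    featspreds = (torch.sigmoid(featslogits) > 0.5).long()
    print(featsloss.item(), featspreds.shape)                        # e.g. 0.74  torch.Size([1, 5, 26])
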
- sbdlogits, sbdlabels, sbdpreds, poslogits, poslabels = self.mtlmodel(slice,mode='dev') + sbdlogits, sbdlabels, sbdpreds, poslogits, poslabels, featslogits, featslabels = self.mtlmodel(slice,mode='dev') # get the pos predictions lengths = [self.mtlmodel.sequence_length] @@ -615,25 +690,56 @@ def read_file(mode='train'): pospreds = self.mtlmodel.viterbidecoder.decode(scores,False,[sentence]) pospreds = [self.mtlmodel.postagsetcrf.get_idx_for_item(p[0]) for pr in pospreds[0] for p in pr] + # get the feats predictions + featspreds = self.mtlmodel.sigmoid(featslogits) + featspreds = (featspreds > self.sigmoidthreshold).long() + featspreds = torch.squeeze(featspreds).tolist() + # get the sbd loss sbdlogits = sbdlogits.permute(0,2,1) sbdtags = torch.LongTensor(sbdlabels).to(self.device) - sbddevloss = self.sbdloss(sbdlogits, sbdtags).item() - # get the pos loss postags = torch.LongTensor(poslabels) postags = postags.to(self.device) posdevloss = self.postagloss(scores,postags).item() + # get the feats loss + featstags = torch.FloatTensor(featslabels) + featstags = featstags.to(self.device) + featstags = torch.unsqueeze(featstags,dim=0) + featsdevloss = self.featsloss(featslogits,featstags).item() + totalsbddevloss += sbddevloss totalposdevloss += posdevloss + totalfeatsdevloss += featsdevloss + + # build the feats tags for the sequence + featsslicepreds = [] + for preds in featspreds: + featsstr = '' + for i in range(0,len(preds)): + if preds[i] != 0: + #if i == 0: + # featsstr += self.mtlmodel.idxtofeatstagset[i] + #else: + if self.mtlmodel.idxtofeatstagset[i].split('=')[0] == featsstr.split('|')[-1].split('=')[0]: + featsstr += ',' + self.mtlmodel.idxtofeatstagset[i].split('=')[1] + else: + if featsstr != '': + featsstr = featsstr + '|' + self.mtlmodel.idxtofeatstagset[i] + else: + featsstr += self.mtlmodel.idxtofeatstagset[i] + + featsslicepreds.append(featsstr) allsbdpreds.extend(sbdpreds) allsbdgold.extend(goldsbdlabels) allpospreds.extend(pospreds) allposgold.extend(goldposlabels) + allfeatsgold.extend(goldfeatslabels) + allfeatspreds.extend(featsslicepreds) #print ('inference time') #print (time() - start) @@ -658,7 +764,10 @@ def read_file(mode='train'): correctpos = sum([1 if p == g else 0 for p,g in zip(allpospreds,allposgold)]) posscores = Score(len(allposgold),len(allpospreds),correctpos,len(allpospreds)) - mtlloss = (totalsbddevloss + totalposdevloss) / len(devdata) + correctfeats = sum([1 if p == g else 0 for p,g in zip(allfeatspreds,allfeatsgold)]) + featsscores = Score(len(allfeatsgold),len(allfeatspreds),correctfeats,len(allfeatspreds)) + + mtlloss = (totalsbddevloss + totalposdevloss + totalfeatsdevloss) / len(devdata) self.writer.add_scalar("mtl_dev_loss", round(mtlloss, 4), int(epoch / self.evalstep)) @@ -677,6 +786,11 @@ def read_file(mode='train'): int(epoch / self.evalstep)) self.writer.add_scalar("pos_dev_recall", round(posscores.recall, 4), int(epoch / self.evalstep)) + self.writer.add_scalar("feats_dev_loss",round(totalfeatsdevloss / len(devdata),4), int(epoch / self.evalstep)) + self.writer.add_scalar("feats_dev_f1",round(featsscores.f1,4),int(epoch / self.evalstep)) + self.writer.add_scalar("feats_dev_precision",round(featsscores.precision,4),int(epoch / self.evalstep)) + self.writer.add_scalar("feats_dev_recall", round(featsscores.recall, 4),int(epoch / self.evalstep)) + print ('sbd dev f1:' + str(sbdscores.f1)) print('sbd dev precision:' + str(sbdscores.precision)) print('sbd dev recall:' + str(sbdscores.recall)) @@ -687,6 +801,11 @@ def read_file(mode='train'): 
print('pos dev recall:' + str(posscores.recall)) print('\n') + print('feats dev f1:' + str(featsscores.f1)) + print('feats dev precision:' + str(featsscores.precision)) + print('feats dev recall:' + str(featsscores.recall)) + print('\n') + if mtlloss < bestloss: bestloss = mtlloss bestmodel = self.bestmodel.replace('.pt','_' + str(round(mtlloss,6)) + '_' + str(round(sbdscores.f1,6)) + '_' + str(round(posscores.f1,6)) + '.pt') @@ -825,11 +944,16 @@ def write_file(filename,mode='train'): elif length == -1: supertoken = 'O' + if sent[i]['feats'] is not None: + feats = '|'.join(k + '=' + v for k,v in sent[i]['feats'].items()) + else: + feats = '' + if sent[i]['id'] == 1: - tr.write(sent[i]['form'] + '\t' + sent[i]['upos'] + '\t' + 'B-SENT' + '\t' + supertoken + '\n') + tr.write(sent[i]['form'] + '\t' + sent[i]['upos'] + '\t' + 'B-SENT' + '\t' + supertoken + '\t' + feats + '\n') else: - tr.write(sent[i]['form'] + '\t' + sent[i]['upos'] + '\t' + 'O' + '\t' + supertoken + '\n') + tr.write(sent[i]['form'] + '\t' + sent[i]['upos'] + '\t' + 'O' + '\t' + supertoken + '\t' + feats + '\n') i += 1 From a89aafa7f2bc3c72c296b8f5d7eda93295a25085 Mon Sep 17 00:00:00 2001 From: nitin Date: Wed, 17 Aug 2022 02:02:56 +0800 Subject: [PATCH 20/32] integrated feats --- hebpipe/heb_pipe.py | 80 ++++++------- hebpipe/lib/mtlmodel.py | 254 ++++++++++++++++++++++++++++------------ 2 files changed, 216 insertions(+), 118 deletions(-) diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index 85f621c..319a6de 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -609,21 +609,11 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, if preloaded is not None: - - rf_tok, xrenner, flair_sent_splitter, parser, tagger, morpher, lemmatizer = preloaded + rf_tok, xrenner, _, parser, _, _, lemmatizer = preloaded else: rf_tok = RFTokenizer(model=model_dir + "heb.sm" + str(sys.version_info[0])) xrenner = Xrenner(model=model_dir + "heb.xrm") - - """ - if sent_tag == "auto" and not punct_sentencer: - flair_sent_splitter = FlairSentSplitter(model_path=model_dir + "heb.sent") - else: - flair_sent_splitter = None - """ parser = None if not do_parse else Parser.load(model_dir + "heb.diaparser",verbose=False) - #tagger = None if not do_tag else FlairTagger() - morpher = None if not do_tag else FlairTagger(morph=True) lemmatizer = None if not do_lemma and not do_tag else init_lemmatizer() if do_whitespace: @@ -678,13 +668,26 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, """ sent_tag = 's' - tagged_conllu, tokenized = mtltagger.split_pos(tokenized,checkpointfile='/home/nitin/Desktop/hebpipe/HebPipe/hebpipe/data/checkpoint/top_wiki_best_sent_pos_model_18.45584_0.883117_0.971578.pt') - pos_tags = [l.split("\t")[3] for l in tagged_conllu.split("\n") if "\t" in l] + tagged_conllu, tokenized, morphs, words = mtltagger.predict(tokenized,checkpointfile='/home/nitin/Desktop/hebpipe/HebPipe/hebpipe/data/checkpoint/wiki_best_sent_pos_model_19.233182_0.864943_0.971897_0.710666.pt') del mtltagger del rf_tok torch.cuda.empty_cache() + zeros = ["0" for i in range(len(morphs))] + zero_conllu = inject_col(zeros, tagged_conllu, into_col=6, skip_supertoks=True) + lemmas = lemmatize(lemmatizer, zero_conllu, morphs) + tagged = inject_col(tagged_conllu, tokenized, 4) + + if do_lemma: + lemmatized = inject_col(lemmas, tagged, -1) + else: + lemmatized = tagged + + morphs = postprocess_morph(morphs, words, lemmas) + morphed = inject_col(morphs, lemmatized, -1) + + """ if do_tag: 
#morpher = None @@ -735,34 +738,31 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, morphs = postprocess_morph(morphs, words, lemmas) morphed = inject_col(morphs,lemmatized,-1) + """ - if not do_parse: - if out_mode == "conllu": - conllized = conllize(morphed, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, - attrs_as_comments=True) - conllized = add_space_after(input_data,conllized) - return conllized - else: - if not PY3: - morphed = morphed.decode("utf8") - return morphed - - else: + if not do_parse: if out_mode == "conllu": - conllized = conllize(tokenized, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, + conllized = conllize(morphed, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, attrs_as_comments=True) - conllized = add_space_after(input_data, conllized) + conllized = add_space_after(input_data,conllized) return conllized else: - return tokenized + if not PY3: + morphed = morphed.decode("utf8") + return morphed - if do_parse: - if filecount == 1: - # Free up GPU memory if no more files need it - del morpher - del tagger - torch.cuda.empty_cache() + """ + else: + if out_mode == "conllu": + conllized = conllize(tokenized, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, + attrs_as_comments=True) + conllized = add_space_after(input_data, conllized) + return conllized + else: + return tokenized + """ + else: conllized = conllize(morphed, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, attrs_as_comments=True, ten_cols=True) parsed = diaparse(parser, conllized) @@ -781,15 +781,6 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, else: parsed = add_space_after(input_data,parsed) return parsed - else: - if out_mode == "conllu": - conllized = conllize(tagged, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, - attrs_as_comments=True) - conllized = add_space_after(input_data, conllized) - return conllized - else: - return tagged - def run_hebpipe(): @@ -891,7 +882,8 @@ def run_hebpipe(): sys.exit(0) #tagger = FlairTagger() tagger = None - morpher = FlairTagger(morph=True) + #morpher = FlairTagger(morph=True) + morpher = None lemmatizer = init_lemmatizer(cpu=opts.cpu, no_post_process=opts.disable_lex) else: tagger = None diff --git a/hebpipe/lib/mtlmodel.py b/hebpipe/lib/mtlmodel.py index 74ccc25..6dab30f 100644 --- a/hebpipe/lib/mtlmodel.py +++ b/hebpipe/lib/mtlmodel.py @@ -43,7 +43,7 @@ def __init__(self, start, end): class MTLModel(nn.Module): - def __init__(self,sbdrnndim=128,posmorphrnndim=256,sbdrnnnumlayers=1,posmorphrnnnumlayers=1,posmorphfflayerdim=512,sbdrnnbidirectional=True,posmorphrnnbidirectional=True,sbdencodertype='lstm',sbdfflayerdim=256,posmorphencodertype='lstm',batchsize=16,sequencelength=256,dropout=0.0,wordropout=0.05,lockeddropout=0.5): + def __init__(self,sbdrnndim=128,posrnndim=256,morphrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers=1,morphrnnnumlayers=1,posfflayerdim=512,morphfflayerdim=512,sbdrnnbidirectional=True,posrnnbidirectional=True,morphrnnbidirectional=True,sbdencodertype='lstm',sbdfflayerdim=256,posencodertype='lstm',morphencodertype='lstm',batchsize=16,sequencelength=256,dropout=0.0,wordropout=0.05,lockeddropout=0.5): super(MTLModel,self).__init__() self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -60,6 +60,7 @@ def __init__(self,sbdrnndim=128,posmorphrnndim=256,sbdrnnnumlayers=1,posmorphrnn 
self.postagsetcrf.add_item("") self.postagsetcrf.add_item("") + """ # FEATS dictionary # IMPORTANT: This should be sorted by key self.featstagset = {'Abbr=Yes':0, 'Aspect=Prog':1, 'Case=Acc':2, 'Case=Gen':3, 'Definite=Com':4, 'Definite=Cons':5, 'Definite=Def':6, 'Definite=Ind':7, 'Definite=Spec':8, @@ -70,8 +71,15 @@ def __init__(self,sbdrnndim=128,posmorphrnndim=256,sbdrnnnumlayers=1,posmorphrnn 'PronType=Emp':35, 'PronType=Ind':36, 'PronType=Int':37, 'PronType=Prs':38, 'Reflex=Yes':39, 'Tense=Fut':40, 'Tense=Past':41, 'Tense=Pres':42, 'Typo=Yes':43, 'VerbForm=Inf':44, 'VerbForm=Part':45, 'VerbType=Cop':46, 'VerbType=Mod':47, 'Voice=Act':48, 'Voice=Mid':49, 'Voice=Pass':50} - self.idxtofeatstagset = {v:k for k,v in self.featstagset.items()} + """ + + # {"Gender","Number","Tense","VerbForm","Voice","HebBinyan","Definite"} + self.featstagset = {'Definite=Com':0, 'Definite=Cons':1, 'Definite=Def':2, 'Definite=Ind':3, 'Definite=Spec':4, + 'Gender=Fem':5, 'Gender=Masc':6, 'HebBinyan=HIFIL':7, 'HebBinyan=HITPAEL':8, 'HebBinyan=HUFAL':9, 'HebBinyan=NIFAL':10, + 'HebBinyan=NITPAEL':11, 'HebBinyan=PAAL':12, 'HebBinyan=PIEL':13, 'HebBinyan=PUAL':14, 'Number=Dual':15, 'Number=Plur':16, 'Number=Sing':17, + 'Tense=Fut':18, 'Tense=Past':19, 'Tense=Pres':20,'VerbForm=Inf':21, 'VerbForm=Part':22,'Voice=Act':23, 'Voice=Mid':24, 'Voice=Pass':25} + self.idxtofeatstagset = {v: k for k, v in self.featstagset.items()} # shared hyper-parameters self.sequence_length = sequencelength self.batch_size = batchsize @@ -92,9 +100,14 @@ def __init__(self,sbdrnndim=128,posmorphrnndim=256,sbdrnnnumlayers=1,posmorphrnn self.sbdrnnbidirectional = sbdrnnbidirectional #Bi-LSTM Encoder for POS tagging - self.posmorphrnndim = posmorphrnndim - self.posmorphrnnnumlayers = posmorphrnnnumlayers - self.posmorphrnnbidirectional = posmorphrnnbidirectional + self.posrnndim = posrnndim + self.posrnnnumlayers = posrnnnumlayers + self.posrnnbidirectional = posrnnbidirectional + + # Encoder for feats + self.morphrnndim = morphrnndim + self.morphrnnnumlayers = morphrnnnumlayers + self.morphrnnbidirectional = morphrnnbidirectional if sbdencodertype == 'lstm': self.sbdencoder = nn.LSTM(input_size=self.embeddingdim, hidden_size=self.sbdrnndim // 2, @@ -115,17 +128,36 @@ def __init__(self,sbdrnndim=128,posmorphrnndim=256,sbdrnnnumlayers=1,posmorphrnn except ValueError as ex: nn.init.constant_(param,0.0) - if posmorphencodertype == 'lstm': - self.posmorphencoder = nn.LSTM(input_size=self.embeddingdim + 5, hidden_size=self.posmorphrnndim // 2, - num_layers=self.posmorphrnnnumlayers, bidirectional=self.posmorphrnnbidirectional, + if posencodertype == 'lstm': + self.posencoder = nn.LSTM(input_size=self.embeddingdim + len(self.supertokenset) + 1, hidden_size=self.posrnndim // 2, + num_layers=self.posrnnnumlayers, bidirectional=self.posrnnbidirectional, + batch_first=True).to(self.device) + elif posencodertype == 'gru': + self.posencoder = nn.GRU(input_size=self.embeddingdim + len(self.supertokenset) + 1 , hidden_size=self.posrnndim // 2, + num_layers=self.posrnnnumlayers, bidirectional=self.posrnnbidirectional, + batch_first=True).to(self.device) + + # param init + for name, param in self.posencoder.named_parameters(): + try: + if 'bias' in name: + nn.init.constant_(param, 0.0) + elif 'weight' in name: + nn.init.xavier_uniform_(param) + except ValueError as ex: + nn.init.constant_(param, 0.0) + + if morphencodertype == 'lstm': + self.morphencoder = nn.LSTM(input_size=self.embeddingdim + len(self.postagset) + len(self.supertokenset) + 1, 
hidden_size=self.morphrnndim // 2, + num_layers=self.morphrnnnumlayers, bidirectional=self.morphrnnbidirectional, batch_first=True).to(self.device) - elif posmorphencodertype == 'gru': - self.posmorphencoder = nn.GRU(input_size=self.embeddingdim + 5, hidden_size=self.posmorphrnndim // 2, - num_layers=self.posmorphrnnnumlayers, bidirectional=self.posmorphrnnbidirectional, + elif morphencodertype == 'gru': + self.morphencoder = nn.GRU(input_size=self.embeddingdim + len(self.postagset) + len(self.supertokenset) + 1, hidden_size=self.morphrnndim // 2, + num_layers=self.morphrnnnumlayers, bidirectional=self.morphrnnbidirectional, batch_first=True).to(self.device) # param init - for name, param in self.posmorphencoder.named_parameters(): + for name, param in self.morphencoder.named_parameters(): try: if 'bias' in name: nn.init.constant_(param, 0.0) @@ -134,6 +166,7 @@ def __init__(self,sbdrnndim=128,posmorphrnndim=256,sbdrnnnumlayers=1,posmorphrnn except ValueError as ex: nn.init.constant_(param, 0.0) + self.relu = nn.ReLU() # Reproject embeddings layer @@ -149,19 +182,32 @@ def __init__(self,sbdrnndim=128,posmorphrnndim=256,sbdrnnnumlayers=1,posmorphrnn nn.init.xavier_normal_(param) # Intermediate feedforward layer - self.posmorphembedding2nn = nn.Linear(in_features=self.embeddingdim + 5,out_features=self.embeddingdim + 5).to(self.device) - self.posmorphfflayerdim = posmorphfflayerdim - self.posmorphfflayer = nn.Linear(in_features=self.posmorphrnndim, out_features=self.posmorphfflayerdim).to(self.device) + self.posembedding2nn = nn.Linear(in_features=self.embeddingdim + len(self.supertokenset) + 1,out_features=self.embeddingdim + len(self.supertokenset) + 1 ).to(self.device) + self.posfflayerdim = posfflayerdim + self.posfflayer = nn.Linear(in_features=self.posrnndim, out_features=self.posfflayerdim).to(self.device) # param init - for name, param in self.posmorphembedding2nn.named_parameters(): + for name, param in self.posembedding2nn.named_parameters(): + if 'bias' in name: + nn.init.constant_(param, 0.0) + elif 'weight' in name: + nn.init.xavier_normal_(param) + + # Intermediate feedforward layer + self.morphembedding2nn = nn.Linear(in_features=self.embeddingdim + len(self.postagset) + len(self.supertokenset) + 1 , + out_features=self.embeddingdim + len(self.postagset) + len(self.supertokenset) + 1).to(self.device) + self.morphfflayerdim = morphfflayerdim + self.morphfflayer = nn.Linear(in_features=self.morphrnndim, out_features=self.morphfflayerdim).to(self.device) + + # param init + for name, param in self.morphembedding2nn.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_normal_(param) # Label space for the pos tagger - self.hidden2postag = nn.Linear(in_features=self.posmorphfflayerdim,out_features=len(self.postagsetcrf)).to(self.device) + self.hidden2postag = nn.Linear(in_features=self.posfflayerdim,out_features=len(self.postagsetcrf)).to(self.device) for name, param in self.hidden2postag.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) @@ -179,7 +225,7 @@ def __init__(self,sbdrnndim=128,posmorphrnndim=256,sbdrnnnumlayers=1,posmorphrnn # label space for morph feats - self.hidden2feats = nn.Linear(in_features=self.posmorphfflayerdim,out_features=len(self.featstagset)).to(self.device) + self.hidden2feats = nn.Linear(in_features=self.morphfflayerdim,out_features=len(self.featstagset)).to(self.device) for name, param in self.hidden2feats.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) @@ -200,9 
+246,6 @@ def __init__(self,sbdrnndim=128,posmorphrnndim=256,sbdrnnnumlayers=1,posmorphrnn nn.init.xavier_normal_(param) self.stride_size = 10 - self.sbdencodertype = sbdencodertype - self.posmorphencodertype = posmorphencodertype - self.sigmoid = nn.Sigmoid() def shingle(self,toks,labels=None): @@ -287,10 +330,12 @@ def forward(self,data,mode='train'): key = f.split('=')[0] value = f.split('=')[1] if ',' not in value: + if f not in self.featstagset.keys(): continue tempfeats[self.featstagset[f]] = 1 else: value = value.split(',') for v in value: + if key + '=' + v not in self.featstagset.keys(): continue tempfeats[self.featstagset[key + '=' + v]] = 1 featsrecord.append(tempfeats) @@ -318,10 +363,12 @@ def forward(self,data,mode='train'): key = f.split('=')[0] value = f.split('=')[1] if ',' not in value: + if f not in self.featstagset.keys(): continue tempfeats[self.featstagset[f]] = 1 else: value = value.split(',') for v in value: + if key + '=' + v not in self.featstagset.keys(): continue tempfeats[self.featstagset[key + '=' + v]] = 1 featslabels.append(tempfeats) @@ -460,23 +507,15 @@ def forward(self,data,mode='train'): supertokenlabels = torch.unsqueeze(supertokenlabels,dim=0) # Add the SBD predictions to the POS Encoder Input! - posmorphembeddings = torch.cat((avgembeddings,sbdpreds,supertokenlabels),dim=2) - - if mode in ('dev','test'): - # Squeeze these to return to the Trainer for scores, now that we are done with them - sbdpreds = torch.squeeze(sbdpreds,dim=2) - sbdpreds = torch.squeeze(sbdpreds, dim=0) - sbdpreds = sbdpreds.tolist() - else: - sbdpreds = None + posembeddings = torch.cat((avgembeddings,sbdpreds,supertokenlabels),dim=2) - posmorphembeddings = self.dropout(posmorphembeddings) - posmorphembeddings = self.worddropout(posmorphembeddings) - posmorphembeddings = self.lockeddropout(posmorphembeddings) - posmorphembeddings = self.posmorphembedding2nn(posmorphembeddings) + posembeddings = self.dropout(posembeddings) + posembeddings = self.worddropout(posembeddings) + posembeddings = self.lockeddropout(posembeddings) + posembeddings = self.posembedding2nn(posembeddings) - feats,_ = self.posmorphencoder(posmorphembeddings) - feats = self.posmorphfflayer(feats) + feats,_ = self.posencoder(posembeddings) + feats = self.posfflayer(feats) feats = self.relu(feats) feats = self.dropout(feats) feats = self.lockeddropout(feats) @@ -485,15 +524,64 @@ def forward(self,data,mode='train'): poslogits = self.hidden2postag(feats) poslogits = self.poscrf(poslogits) + # get the pos CRF predictions + if mode == 'train': + lengths = [self.sequence_length] * self.batch_size + else: + lengths = [self.sequence_length] + scores = (poslogits, lengths, self.poscrf.transitions) + sents = [] + for s in sentences: + sents.append(Sentence(' '.join(s),use_tokenizer=False)) + + pospreds = self.viterbidecoder.decode(scores, False, sents) + pospreds = [[self.postagsetcrf.get_idx_for_item(p[0])for p in pr] for pr in pospreds[0]] + pospredsonehot = [] + for pred in pospreds: + preds = [] + for p in pred: + onehot = [0] * len(self.postagset.keys()) + onehot[p-1] = 1 + preds.append(onehot) + pospredsonehot.append(preds) + + pospredsonehot = torch.LongTensor(pospredsonehot) + pospredsonehot = pospredsonehot.to(self.device) + + morphembeddings = torch.cat((avgembeddings, sbdpreds, supertokenlabels,pospredsonehot), dim=2) + morphembeddings = self.dropout(morphembeddings) + morphembeddings = self.worddropout(morphembeddings) + morphembeddings = self.lockeddropout(morphembeddings) + morphembeddings = 
self.morphembedding2nn(morphembeddings) + + feats, _ = self.morphencoder(morphembeddings) + feats = self.morphfflayer(feats) + feats = self.relu(feats) + feats = self.dropout(feats) + feats = self.lockeddropout(feats) + # logits for morphs featslogits = self.hidden2feats(feats) - return sbdlogits, finalsbdlabels, sbdpreds, poslogits, poslabels, featslogits,featslabels # returns the logits and labels + if mode in ('dev','test'): + # Squeeze these to return to the Trainer for scores, now that we are done with them + sbdpreds = torch.squeeze(sbdpreds,dim=2) + sbdpreds = torch.squeeze(sbdpreds, dim=0) + sbdpreds = sbdpreds.tolist() + + # Unroll the pos predictions + pospreds = [p for pred in pospreds for p in pred] + + else: + sbdpreds = None + pospreds = None + + return sbdlogits, finalsbdlabels, sbdpreds, poslogits, poslabels, pospreds, featslogits,featslabels # returns the logits and labels class Tagger(): def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=128,sbdrnnnumlayers=1,sbdrnnbidirectional=True,sbdfflayerdim=256,posrnndim=256,posrnnnumlayers=1,posrnnbidirectional=True,posfflayerdim=512,dropout=0.05,wordropout=0.05,lockeddropout=0.5,sbdencodertype='lstm',posencodertype='lstm',learningrate = 0.001,bestmodelpath='../data/checkpoint/',batchsize=32,sequencelength=256,datatype='htb'): - self.mtlmodel = MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdencodertype=sbdencodertype,sbdfflayerdim=sbdfflayerdim,dropout=dropout,wordropout=wordropout,lockeddropout=lockeddropout,posmorphrnndim=posrnndim,posmorphrnnbidirectional=posrnnbidirectional,posmorphencodertype=posencodertype,posmorphrnnnumlayers=posrnnnumlayers,posmorphfflayerdim=posfflayerdim,batchsize=batchsize,sequencelength=sequencelength) + self.mtlmodel = MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdencodertype=sbdencodertype,sbdfflayerdim=sbdfflayerdim,dropout=dropout,wordropout=wordropout,lockeddropout=lockeddropout,posrnndim=posrnndim,posrnnbidirectional=posrnnbidirectional,posencodertype=posencodertype,posrnnnumlayers=posrnnnumlayers,posfflayerdim=posfflayerdim,batchsize=batchsize,sequencelength=sequencelength) if trainflag == True: @@ -533,12 +621,13 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdr self.featsloss.to(self.device) self.optimizer = torch.optim.AdamW(list(self.mtlmodel.sbdencoder.parameters()) + list(self.mtlmodel.sbdembedding2nn.parameters()) + - list(self.mtlmodel.hidden2sbd.parameters()) + list(self.mtlmodel.posmorphencoder.parameters()) + list(self.mtlmodel.posmorphembedding2nn.parameters()) + list(self.mtlmodel.hidden2sbd.parameters()) + list(self.mtlmodel.posencoder.parameters()) + list(self.mtlmodel.posembedding2nn.parameters()) + list(self.mtlmodel.hidden2postag.parameters()) + list(self.mtlmodel.poscrf.parameters()) - + list(self.mtlmodel.hidden2feats.parameters()), lr=learningrate) + + list(self.mtlmodel.hidden2feats.parameters()) + list(self.mtlmodel.morphfflayer.parameters()) + list(self.mtlmodel.morphembedding2nn.parameters()) + list(self.mtlmodel.morphencoder.parameters()) + + list(self.mtlmodel.posfflayer.parameters()) + list(self.mtlmodel.sbdfflayer.parameters()), lr=learningrate) self.scheduler = torch.optim.lr_scheduler.CyclicLR(self.optimizer,base_lr=learningrate/10,max_lr=learningrate,step_size_up=250,cycle_momentum=False) - self.evalstep = 1 + self.evalstep = 50 self.sigmoidthreshold = 0.5 @@ -610,7 +699,7 @@ def 
read_file(mode='train'): data = [datum for datum in data if len(datum) == self.mtlmodel.sequence_length] self.mtlmodel.batch_size = len(data) - sbdlogits, sbdlabels, _, poslogits,poslabels, featslogits, featslabels = self.mtlmodel(data) + sbdlogits, sbdlabels, _, poslogits, poslabels, _ , featslogits, featslabels = self.mtlmodel(data) sbdtags = torch.LongTensor(sbdlabels).to(self.device) sbdlogits = sbdlogits.permute(0,2,1) @@ -662,7 +751,7 @@ def read_file(mode='train'): allfeatspreds = [] # because of shingling for SBD, the dev data needs to be split in slices for inference, as GPU may run out of memory with shingles on the full token list. - # shingling and SBD prediction is done on the individual slice, as well as POS tag predictions. + # shingling and SBD prediction is done on the individual slice, as well as POS tag predictions and feats. # TODO This naturally increases prediction time...but can't think of a better way. for slice in devdata: @@ -670,25 +759,15 @@ def read_file(mode='train'): if len(slice) != self.mtlmodel.sequence_length: # this will happen in one case, for the last slice in the dev batch self.mtlmodel.sequence_length = len(slice) - # Flair CRF decoding uses the Sentence object.. - sentence = ' '.join([s.split('\t')[0].strip() for s in slice]) - sentence = Sentence(sentence,use_tokenizer=False) - + # Extract gold labels goldsbdlabels = [s.split('\t')[2].strip() for s in slice] goldsbdlabels = [self.mtlmodel.sbd_tag2idx[s] for s in goldsbdlabels] goldposlabels = [s.split('\t')[1].strip() for s in slice] goldposlabels = [self.mtlmodel.postagsetcrf.get_idx_for_item(s) for s in goldposlabels] goldfeatslabels = [s.split('\t')[4].strip() for s in slice] - # sbdpreds already contains the sbd predictions. These were necessary for input to the POS encoder. 
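Both the dev loop and the inference path below reverse the multi-hot encoding: for each token they walk the active feature indices and rebuild a UD FEATS string, joining values that share a key with ',' and distinct keys with '|', which is why the tagset must be sorted by key. A small standalone sketch of that decode step, assuming an index-to-label map like idxtofeatstagset; the toy map here is illustrative only:

    # Illustrative only: a toy index-to-label map, sorted by key as the patch requires.
    idxtofeatstagset = {0: 'Gender=Fem', 1: 'Gender=Masc', 2: 'Number=Plur', 3: 'Number=Sing'}

    def decode_feats(pred_row):
        """Rebuild a FEATS string from one token's thresholded multi-hot prediction."""
        featsstr = ''
        for i, active in enumerate(pred_row):
            if not active:
                continue
            key = idxtofeatstagset[i].split('=')[0]
            if featsstr and featsstr.split('|')[-1].split('=')[0] == key:
                # same key as the previous feature: append the value after a comma
                featsstr += ',' + idxtofeatstagset[i].split('=')[1]
            elif featsstr:
                featsstr += '|' + idxtofeatstagset[i]
            else:
                featsstr = idxtofeatstagset[i]
        return featsstr if featsstr else '_'

    assert decode_feats([1, 1, 0, 1]) == 'Gender=Fem,Masc|Number=Sing'
    assert decode_feats([0, 0, 0, 0]) == '_'
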
- sbdlogits, sbdlabels, sbdpreds, poslogits, poslabels, featslogits, featslabels = self.mtlmodel(slice,mode='dev') - - # get the pos predictions - lengths = [self.mtlmodel.sequence_length] - lengths = torch.LongTensor(lengths).to(self.device) - scores = (poslogits, lengths, self.mtlmodel.poscrf.transitions) - pospreds = self.mtlmodel.viterbidecoder.decode(scores,False,[sentence]) - pospreds = [self.mtlmodel.postagsetcrf.get_idx_for_item(p[0]) for pr in pospreds[0] for p in pr] + # RUn through the model and get the labels and logits (and preds for stepwise models) + sbdlogits, sbdlabels, sbdpreds, poslogits, poslabels, pospreds, featslogits, featslabels = self.mtlmodel(slice,mode='dev') # get the feats predictions featspreds = self.mtlmodel.sigmoid(featslogits) @@ -703,6 +782,9 @@ def read_file(mode='train'): # get the pos loss postags = torch.LongTensor(poslabels) postags = postags.to(self.device) + lengths = [self.mtlmodel.sequence_length] + lengths = torch.LongTensor(lengths).to(self.device) + scores = (poslogits, lengths, self.mtlmodel.poscrf.transitions) posdevloss = self.postagloss(scores,postags).item() # get the feats loss @@ -711,6 +793,7 @@ def read_file(mode='train'): featstags = torch.unsqueeze(featstags,dim=0) featsdevloss = self.featsloss(featslogits,featstags).item() + # add up the losses across the slices for the avg totalsbddevloss += sbddevloss totalposdevloss += posdevloss totalfeatsdevloss += featsdevloss @@ -721,9 +804,6 @@ def read_file(mode='train'): featsstr = '' for i in range(0,len(preds)): if preds[i] != 0: - #if i == 0: - # featsstr += self.mtlmodel.idxtofeatstagset[i] - #else: if self.mtlmodel.idxtofeatstagset[i].split('=')[0] == featsstr.split('|')[-1].split('=')[0]: featsstr += ',' + self.mtlmodel.idxtofeatstagset[i].split('=')[1] else: @@ -734,6 +814,7 @@ def read_file(mode='train'): featsslicepreds.append(featsstr) + # build the gold and predictions for the entire dev set allsbdpreds.extend(sbdpreds) allsbdgold.extend(goldsbdlabels) allpospreds.extend(pospreds) @@ -746,6 +827,7 @@ def read_file(mode='train'): if self.mtlmodel.sequence_length != old_seqlen: self.mtlmodel.sequence_length = old_seqlen + # Now get the scores goldspans = [] predspans = [] goldstartindex = 0 @@ -767,6 +849,7 @@ def read_file(mode='train'): correctfeats = sum([1 if p == g else 0 for p,g in zip(allfeatspreds,allfeatsgold)]) featsscores = Score(len(allfeatsgold),len(allfeatspreds),correctfeats,len(allfeatspreds)) + # Write the scores and losses to tensorboard and console mtlloss = (totalsbddevloss + totalposdevloss + totalfeatsdevloss) / len(devdata) self.writer.add_scalar("mtl_dev_loss", round(mtlloss, 4), @@ -806,12 +889,13 @@ def read_file(mode='train'): print('feats dev recall:' + str(featsscores.recall)) print('\n') + # save the best model if mtlloss < bestloss: bestloss = mtlloss - bestmodel = self.bestmodel.replace('.pt','_' + str(round(mtlloss,6)) + '_' + str(round(sbdscores.f1,6)) + '_' + str(round(posscores.f1,6)) + '.pt') + bestmodel = self.bestmodel.replace('.pt','_' + str(round(mtlloss,6)) + '_' + str(round(sbdscores.f1,6)) + '_' + str(round(posscores.f1,6)) + '_' + str(round(featsscores.f1,6)) + '.pt') torch.save({'epoch':epoch,'model_state_dict':self.mtlmodel.state_dict(),'optimizer_state_dict':self.optimizer.state_dict(),'poscrf_state_dict':self.mtlmodel.poscrf.state_dict()},bestmodel) - def predict(self,toks,checkpointfile=None): + def inference(self,toks,checkpointfile=None): def is_tok(sgml_line): return len(sgml_line) > 0 and not (sgml_line.startswith("<") and 
sgml_line.endswith(">")) @@ -881,33 +965,53 @@ def unescape(token): with torch.no_grad(): + allwords = [] allsbdpreds = [] allpospreds = [] + allfeatspreds = [] for slice in slices: if len(slice[0]) != self.mtlmodel.sequence_length: # this will happen in one case, for the last slice in the batch self.mtlmodel.sequence_length = len(slice[0]) - # Flair CRF decoding uses the Sentence object.. - sentence = ' '.join([s.split('\t')[0].strip() for s in slice[0]]) - sentence = Sentence(sentence, use_tokenizer=False) + _, _, sbdpreds, _,_,pospreds, featslogits,_ = self.mtlmodel(slice, mode='test') - _, _, sbdpreds, poslogits, _ = self.mtlmodel(slice, mode='test') + # get the feats predictions + featspreds = self.mtlmodel.sigmoid(featslogits) + featspreds = (featspreds > self.sigmoidthreshold).long() + featspreds = torch.squeeze(featspreds).tolist() + + featsslicepreds = [] + for preds in featspreds: + featsstr = '' + for i in range(0, len(preds)): + if preds[i] != 0: + if self.mtlmodel.idxtofeatstagset[i].split('=')[0] == featsstr.split('|')[-1].split('=')[0]: + featsstr += ',' + self.mtlmodel.idxtofeatstagset[i].split('=')[1] + else: + if featsstr != '': + featsstr = featsstr + '|' + self.mtlmodel.idxtofeatstagset[i] + else: + featsstr += self.mtlmodel.idxtofeatstagset[i] + + featsslicepreds.append(featsstr) # get the pos predictions - lengths = [self.mtlmodel.sequence_length] - lengths = torch.LongTensor(lengths).to(self.device) - scores = (poslogits, lengths, self.mtlmodel.poscrf.transitions) - pospreds = self.mtlmodel.viterbidecoder.decode(scores, False, [sentence]) - pospreds = [self.mtlmodel.postagsetcrf.get_idx_for_item(p[0]) for pr in pospreds[0] for p in pr] + #lengths = [self.mtlmodel.sequence_length] + #lengths = torch.LongTensor(lengths).to(self.device) + #scores = (poslogits, lengths, self.mtlmodel.poscrf.transitions) + #pospreds = self.mtlmodel.viterbidecoder.decode(scores, False, [sentence]) + #pospreds = [self.mtlmodel.postagsetcrf.get_idx_for_item(p[0]) for pr in pospreds[0] for p in pr] allsbdpreds.extend(sbdpreds) allpospreds.extend(pospreds) + allfeatspreds.extend(featsslicepreds) + allwords.extend([s.split('\t')[0].strip() for s in slice[0]]) allpospreds = [self.mtlmodel.postagsetcrf.get_item_for_index(p) for p in allpospreds] - return allsbdpreds,allpospreds + return allsbdpreds,allpospreds, allfeatspreds, allwords def prepare_data_files(self): """ @@ -977,7 +1081,7 @@ def read_conllu(self,mode='train'): with open(file, "r", encoding="utf-8") as f: return conllu.parse(f.read(), fields=fields) - def split_pos(self, xml_data,out_mode='conllu',checkpointfile = None): + def predict(self, xml_data,out_mode='conllu',checkpointfile = None): def is_sgml_tag(line): return line.startswith("<") and line.endswith(">") @@ -1142,7 +1246,7 @@ def get_bound_group_map(data): # don't feed the sentencer our pos and lemma predictions, if we have them no_pos_lemma = re.sub(r"([^\n\t]*?)\t[^\n\t]*?\t[^\n\t]*?\n", r"\1\n", xml_data) - split_indices, pos_tags = self.predict(no_pos_lemma,checkpointfile=checkpointfile) + split_indices, pos_tags, morphs, words = self.inference(no_pos_lemma,checkpointfile=checkpointfile) # for xml counter = 0 @@ -1232,7 +1336,9 @@ def get_bound_group_map(data): assert k == len(pos_tags) # Fails means pos tags aren't aligned with tokens - return "\n".join(output), lines + morphs = [m if m != '' else '_' for m in morphs] + + return "\n".join(output), lines, morphs, words def spans_score(self, gold_spans, system_spans): correct, gi, si = 0, 0, 0 From 
8cd80dd48abb509992865adfeca89e54b6bec18e Mon Sep 17 00:00:00 2001 From: nitin Date: Fri, 19 Aug 2022 03:43:35 +0800 Subject: [PATCH 21/32] fixed bugs --- hebpipe/lib/mtlmodel.py | 93 ++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 43 deletions(-) diff --git a/hebpipe/lib/mtlmodel.py b/hebpipe/lib/mtlmodel.py index 6dab30f..375f5a6 100644 --- a/hebpipe/lib/mtlmodel.py +++ b/hebpipe/lib/mtlmodel.py @@ -43,7 +43,7 @@ def __init__(self, start, end): class MTLModel(nn.Module): - def __init__(self,sbdrnndim=128,posrnndim=256,morphrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers=1,morphrnnnumlayers=1,posfflayerdim=512,morphfflayerdim=512,sbdrnnbidirectional=True,posrnnbidirectional=True,morphrnnbidirectional=True,sbdencodertype='lstm',sbdfflayerdim=256,posencodertype='lstm',morphencodertype='lstm',batchsize=16,sequencelength=256,dropout=0.0,wordropout=0.05,lockeddropout=0.5): + def __init__(self,sbdrnndim=128,posrnndim=256,morphrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers=1,morphrnnnumlayers=1,posfflayerdim=512,morphfflayerdim=512,sbdrnnbidirectional=True,posrnnbidirectional=True,morphrnnbidirectional=True,sbdencodertype='lstm',sbdfflayerdim=256,posencodertype='lstm',morphencodertype='lstm',batchsize=32,sequencelength=256,dropout=0.0,wordropout=0.05,lockeddropout=0.5): super(MTLModel,self).__init__() self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -72,9 +72,6 @@ def __init__(self,sbdrnndim=128,posrnndim=256,morphrnndim=256,sbdrnnnumlayers=1, 'Typo=Yes':43, 'VerbForm=Inf':44, 'VerbForm=Part':45, 'VerbType=Cop':46, 'VerbType=Mod':47, 'Voice=Act':48, 'Voice=Mid':49, 'Voice=Pass':50} """ - - # {"Gender","Number","Tense","VerbForm","Voice","HebBinyan","Definite"} - self.featstagset = {'Definite=Com':0, 'Definite=Cons':1, 'Definite=Def':2, 'Definite=Ind':3, 'Definite=Spec':4, 'Gender=Fem':5, 'Gender=Masc':6, 'HebBinyan=HIFIL':7, 'HebBinyan=HITPAEL':8, 'HebBinyan=HUFAL':9, 'HebBinyan=NIFAL':10, 'HebBinyan=NITPAEL':11, 'HebBinyan=PAAL':12, 'HebBinyan=PIEL':13, 'HebBinyan=PUAL':14, 'Number=Dual':15, 'Number=Plur':16, 'Number=Sing':17, @@ -148,11 +145,11 @@ def __init__(self,sbdrnndim=128,posrnndim=256,morphrnndim=256,sbdrnnnumlayers=1, nn.init.constant_(param, 0.0) if morphencodertype == 'lstm': - self.morphencoder = nn.LSTM(input_size=self.embeddingdim + len(self.postagset) + len(self.supertokenset) + 1, hidden_size=self.morphrnndim // 2, + self.morphencoder = nn.LSTM(input_size=self.embeddingdim + len(self.postagsetcrf) + len(self.supertokenset) + 1, hidden_size=self.morphrnndim // 2, num_layers=self.morphrnnnumlayers, bidirectional=self.morphrnnbidirectional, batch_first=True).to(self.device) elif morphencodertype == 'gru': - self.morphencoder = nn.GRU(input_size=self.embeddingdim + len(self.postagset) + len(self.supertokenset) + 1, hidden_size=self.morphrnndim // 2, + self.morphencoder = nn.GRU(input_size=self.embeddingdim + len(self.postagsetcrf) + len(self.supertokenset) + 1, hidden_size=self.morphrnndim // 2, num_layers=self.morphrnnnumlayers, bidirectional=self.morphrnnbidirectional, batch_first=True).to(self.device) @@ -181,6 +178,13 @@ def __init__(self,sbdrnndim=128,posrnndim=256,morphrnndim=256,sbdrnnnumlayers=1, elif 'weight' in name: nn.init.xavier_normal_(param) + for name, param in self.sbdfflayer.named_parameters(): + if 'bias' in name: + nn.init.constant_(param, 0.0) + elif 'weight' in name: + nn.init.xavier_normal_(param) + + # Intermediate feedforward layer self.posembedding2nn = nn.Linear(in_features=self.embeddingdim + 
len(self.supertokenset) + 1,out_features=self.embeddingdim + len(self.supertokenset) + 1 ).to(self.device) self.posfflayerdim = posfflayerdim @@ -193,9 +197,15 @@ def __init__(self,sbdrnndim=128,posrnndim=256,morphrnndim=256,sbdrnnnumlayers=1, elif 'weight' in name: nn.init.xavier_normal_(param) + for name, param in self.posfflayer.named_parameters(): + if 'bias' in name: + nn.init.constant_(param, 0.0) + elif 'weight' in name: + nn.init.xavier_normal_(param) + # Intermediate feedforward layer - self.morphembedding2nn = nn.Linear(in_features=self.embeddingdim + len(self.postagset) + len(self.supertokenset) + 1 , - out_features=self.embeddingdim + len(self.postagset) + len(self.supertokenset) + 1).to(self.device) + self.morphembedding2nn = nn.Linear(in_features=self.embeddingdim + len(self.postagsetcrf) + len(self.supertokenset) + 1 , + out_features=self.embeddingdim + len(self.postagsetcrf) + len(self.supertokenset) + 1).to(self.device) self.morphfflayerdim = morphfflayerdim self.morphfflayer = nn.Linear(in_features=self.morphrnndim, out_features=self.morphfflayerdim).to(self.device) @@ -206,6 +216,13 @@ def __init__(self,sbdrnndim=128,posrnndim=256,morphrnndim=256,sbdrnnnumlayers=1, elif 'weight' in name: nn.init.xavier_normal_(param) + # param init + for name, param in self.morphfflayer.named_parameters(): + if 'bias' in name: + nn.init.constant_(param, 0.0) + elif 'weight' in name: + nn.init.xavier_normal_(param) + # Label space for the pos tagger self.hidden2postag = nn.Linear(in_features=self.posfflayerdim,out_features=len(self.postagsetcrf)).to(self.device) for name, param in self.hidden2postag.named_parameters(): @@ -540,8 +557,8 @@ def forward(self,data,mode='train'): for pred in pospreds: preds = [] for p in pred: - onehot = [0] * len(self.postagset.keys()) - onehot[p-1] = 1 + onehot = [0] * len(self.postagsetcrf) + onehot[p] = 1 preds.append(onehot) pospredsonehot.append(preds) @@ -579,9 +596,9 @@ def forward(self,data,mode='train'): return sbdlogits, finalsbdlabels, sbdpreds, poslogits, poslabels, pospreds, featslogits,featslabels # returns the logits and labels class Tagger(): - def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=128,sbdrnnnumlayers=1,sbdrnnbidirectional=True,sbdfflayerdim=256,posrnndim=256,posrnnnumlayers=1,posrnnbidirectional=True,posfflayerdim=512,dropout=0.05,wordropout=0.05,lockeddropout=0.5,sbdencodertype='lstm',posencodertype='lstm',learningrate = 0.001,bestmodelpath='../data/checkpoint/',batchsize=32,sequencelength=256,datatype='htb'): + def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=128,sbdrnnnumlayers=1,sbdrnnbidirectional=True,sbdfflayerdim=256,posrnndim=256,posrnnnumlayers=1,posrnnbidirectional=True,posfflayerdim=512,morphrnndim=256,morphrnnnumlayers=1,morphrnnbidirectional=True,morphfflayerdim=512,morphencodertype='lstm',dropout=0.05,wordropout=0.05,lockeddropout=0.5,sbdencodertype='lstm',posencodertype='lstm',learningrate = 0.001,bestmodelpath='../data/checkpoint/',batchsize=32,sequencelength=256,datatype='htb'): - self.mtlmodel = MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdencodertype=sbdencodertype,sbdfflayerdim=sbdfflayerdim,dropout=dropout,wordropout=wordropout,lockeddropout=lockeddropout,posrnndim=posrnndim,posrnnbidirectional=posrnnbidirectional,posencodertype=posencodertype,posrnnnumlayers=posrnnnumlayers,posfflayerdim=posfflayerdim,batchsize=batchsize,sequencelength=sequencelength) + self.mtlmodel = 
MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdencodertype=sbdencodertype,sbdfflayerdim=sbdfflayerdim,dropout=dropout,wordropout=wordropout,lockeddropout=lockeddropout,posrnndim=posrnndim,posrnnbidirectional=posrnnbidirectional,posencodertype=posencodertype,posrnnnumlayers=posrnnnumlayers,posfflayerdim=posfflayerdim,morphrnndim=morphrnndim,morphrnnnumlayers=morphrnnnumlayers,morphencodertype=morphencodertype,morphrnnbidirectional=morphrnnbidirectional,morphfflayerdim=morphfflayerdim,batchsize=batchsize,sequencelength=sequencelength) if trainflag == True: @@ -674,7 +691,7 @@ def read_file(mode='train'): return dataset - epochs = 3500 + epochs = 5000 bestloss = float('inf') trainingdata = read_file() @@ -997,13 +1014,6 @@ def unescape(token): featsslicepreds.append(featsstr) - # get the pos predictions - #lengths = [self.mtlmodel.sequence_length] - #lengths = torch.LongTensor(lengths).to(self.device) - #scores = (poslogits, lengths, self.mtlmodel.poscrf.transitions) - #pospreds = self.mtlmodel.viterbidecoder.decode(scores, False, [sentence]) - #pospreds = [self.mtlmodel.postagsetcrf.get_idx_for_item(p[0]) for pr in pospreds[0] for p in pr] - allsbdpreds.extend(sbdpreds) allpospreds.extend(pospreds) allfeatspreds.extend(featsslicepreds) @@ -1360,7 +1370,7 @@ def main(): # testing only parser = argparse.ArgumentParser() parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--seqlen', type=int, default=192) + parser.add_argument('--seqlen', type=int, default=256) parser.add_argument('--trainbatch', type=int, default=16) parser.add_argument('--datatype', type=str, default='wiki') parser.add_argument('--sbdrnndim', type=int, default=256) @@ -1380,32 +1390,29 @@ def main(): # testing only args = parser.parse_args() - iahltwikitrain = '../he_iahltwiki-ud-train.conllu' - iahltwikidev = '../he_iahltwiki-ud-dev.conllu' - - htbdev = '../he_htb-ud-dev.conllu' - htbtrain = '../he_htb-ud-train.conllu' - if args.datatype == 'wiki': - tagger = Tagger(trainflag=True, trainfile=iahltwikitrain, devfile=iahltwikidev, sbdrnndim=args.sbdrnndim, sbdfflayerdim=args.sbdfflayerdim, - posrnndim=args.posrnndim, posfflayerdim=args.posfflayerdim, sbdrnnbidirectional=args.sbdrnnbidirectional, - posrnnbidirectional=args.posrnnbidirectional, sbdrnnnumlayers=args.sbdrnnnumlayers, - posrnnnumlayers=args.posrnnnumlayers, sbdencodertype=args.sbdencodertype, - posencodertype=args.posencodertype - , learningrate=args.lr, batchsize=args.trainbatch, sequencelength=args.seqlen, - dropout=args.dropout, wordropout=args.worddropout, lockeddropout=args.lockeddropout,datatype=args.datatype) - + trainfile = '../he_iahltwiki-ud-train.conllu' + devfile = '../he_iahltwiki-ud-dev.conllu' else: - tagger = Tagger(trainflag=True, trainfile=htbtrain, devfile=htbdev, sbdrnndim=args.sbdrnndim,sbdfflayerdim=args.sbdfflayerdim, - posrnndim=args.posrnndim, posfflayerdim=args.posfflayerdim,sbdrnnbidirectional=args.sbdrnnbidirectional, - posrnnbidirectional=args.posrnnbidirectional, sbdrnnnumlayers=args.sbdrnnnumlayers, - posrnnnumlayers=args.posrnnnumlayers, sbdencodertype=args.sbdencodertype, - posencodertype=args.posencodertype - , learningrate=args.lr, batchsize=args.trainbatch, sequencelength=args.seqlen, - dropout=args.dropout, wordropout=args.worddropout, lockeddropout=args.lockeddropout,datatype=args.datatype) + devfile = '../he_htb-ud-dev.conllu' + trainfile = '../he_htb-ud-train.conllu' + + tagger = Tagger(trainflag=True, trainfile=trainfile, devfile=devfile, 
sbdrnndim=args.sbdrnndim, + sbdfflayerdim=args.sbdfflayerdim, + posrnndim=args.posrnndim, posfflayerdim=args.posfflayerdim, + sbdrnnbidirectional=args.sbdrnnbidirectional, + posrnnbidirectional=args.posrnnbidirectional, sbdrnnnumlayers=args.sbdrnnnumlayers, + posrnnnumlayers=args.posrnnnumlayers, sbdencodertype=args.sbdencodertype, + posencodertype=args.posencodertype, + morphrnnbidirectional=args.morphrnnbidirectional, morphrnndim=args.morphrnndim, + morphfflayerdim=args.morphfflayerdim, morphencodertype=args.morphencodertype, + morphrnnnumlayers=args.morphrnnnumlayers, + learningrate=args.lr, batchsize=args.trainbatch, sequencelength=args.seqlen, + dropout=args.dropout, wordropout=args.worddropout, lockeddropout=args.lockeddropout, + datatype=args.datatype) tagger.prepare_data_files() - #tagger.train(checkpointfile='/home/nitin/Desktop/hebpipe/HebPipe/hebpipe/data/checkpoint/htb_best_sent_pos_model_13.316283_0.979424_0.98009.pt') + # tagger.train(checkpointfile='/home/nitin/Desktop/hebpipe/HebPipe/hebpipe/data/checkpoint/htb_best_sent_pos_model_13.316283_0.979424_0.98009.pt') tagger.train() From ec38cec1facd977f2b6a2fe01fa3738082a4d575 Mon Sep 17 00:00:00 2001 From: nitin Date: Fri, 19 Aug 2022 13:32:46 +0800 Subject: [PATCH 22/32] mtl with feats --- hebpipe/heb_pipe.py | 13 ++----------- hebpipe/lib/mtlmodel.py | 20 +++++++++++++------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index 319a6de..18b7542 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -12,11 +12,6 @@ from stanza.models.common.doc import Document import torch -from time import time -import cProfile -import pstats - - from rftokenizer import RFTokenizer try: # Module usage from .lib.xrenner import Xrenner @@ -601,11 +596,7 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, if from_pipes: input_data = input_data.replace("|","") - # Wiki - mtltagger = Tagger(trainflag=False,bestmodelpath='data/checkpoint/',sequencelength=256,sbdrnndim=256,posrnndim=512,sbdfflayerdim=256) - - # HTB - #mtltagger = Tagger(trainflag=False,bestmodelpath='data/checkpoint/',sequencelength=256,sbdrnndim=256,posrnndim=512,sbdfflayerdim=256) + mtltagger = Tagger(trainflag=False,bestmodelpath='data/checkpoint/',sequencelength=256) if preloaded is not None: @@ -668,7 +659,7 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, """ sent_tag = 's' - tagged_conllu, tokenized, morphs, words = mtltagger.predict(tokenized,checkpointfile='/home/nitin/Desktop/hebpipe/HebPipe/hebpipe/data/checkpoint/wiki_best_sent_pos_model_19.233182_0.864943_0.971897_0.710666.pt') + tagged_conllu, tokenized, morphs, words = mtltagger.predict(tokenized,checkpointfile='/home/nitin/Desktop/hebpipe/HebPipe/hebpipe/data/checkpoint/top_wiki_best_mtlmodel_38.503373_0.890511_0.972855_0.72344.pt') del mtltagger del rf_tok diff --git a/hebpipe/lib/mtlmodel.py b/hebpipe/lib/mtlmodel.py index 375f5a6..c56d83d 100644 --- a/hebpipe/lib/mtlmodel.py +++ b/hebpipe/lib/mtlmodel.py @@ -43,7 +43,7 @@ def __init__(self, start, end): class MTLModel(nn.Module): - def __init__(self,sbdrnndim=128,posrnndim=256,morphrnndim=256,sbdrnnnumlayers=1,posrnnnumlayers=1,morphrnnnumlayers=1,posfflayerdim=512,morphfflayerdim=512,sbdrnnbidirectional=True,posrnnbidirectional=True,morphrnnbidirectional=True,sbdencodertype='lstm',sbdfflayerdim=256,posencodertype='lstm',morphencodertype='lstm',batchsize=32,sequencelength=256,dropout=0.0,wordropout=0.05,lockeddropout=0.5): + def 
__init__(self,sbdrnndim=256,posrnndim=512,morphrnndim=512,sbdrnnnumlayers=1,posrnnnumlayers=1,morphrnnnumlayers=1,posfflayerdim=512,morphfflayerdim=512,sbdrnnbidirectional=True,posrnnbidirectional=True,morphrnnbidirectional=True,sbdencodertype='lstm',sbdfflayerdim=256,posencodertype='lstm',morphencodertype='lstm',batchsize=32,sequencelength=256,dropout=0.0,wordropout=0.05,lockeddropout=0.5): super(MTLModel,self).__init__() self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -596,7 +596,7 @@ def forward(self,data,mode='train'): return sbdlogits, finalsbdlabels, sbdpreds, poslogits, poslabels, pospreds, featslogits,featslabels # returns the logits and labels class Tagger(): - def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=128,sbdrnnnumlayers=1,sbdrnnbidirectional=True,sbdfflayerdim=256,posrnndim=256,posrnnnumlayers=1,posrnnbidirectional=True,posfflayerdim=512,morphrnndim=256,morphrnnnumlayers=1,morphrnnbidirectional=True,morphfflayerdim=512,morphencodertype='lstm',dropout=0.05,wordropout=0.05,lockeddropout=0.5,sbdencodertype='lstm',posencodertype='lstm',learningrate = 0.001,bestmodelpath='../data/checkpoint/',batchsize=32,sequencelength=256,datatype='htb'): + def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=256,sbdrnnnumlayers=1,sbdrnnbidirectional=True,sbdfflayerdim=256,posrnndim=512,posrnnnumlayers=1,posrnnbidirectional=True,posfflayerdim=512,morphrnndim=512,morphrnnnumlayers=1,morphrnnbidirectional=True,morphfflayerdim=512,morphencodertype='lstm',dropout=0.05,wordropout=0.05,lockeddropout=0.5,sbdencodertype='lstm',posencodertype='lstm',learningrate = 0.001,bestmodelpath='../data/checkpoint/',batchsize=32,sequencelength=256,datatype='htb'): self.mtlmodel = MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdencodertype=sbdencodertype,sbdfflayerdim=sbdfflayerdim,dropout=dropout,wordropout=wordropout,lockeddropout=lockeddropout,posrnndim=posrnndim,posrnnbidirectional=posrnnbidirectional,posencodertype=posencodertype,posrnnnumlayers=posrnnnumlayers,posfflayerdim=posfflayerdim,morphrnndim=morphrnndim,morphrnnnumlayers=morphrnnnumlayers,morphencodertype=morphencodertype,morphrnnbidirectional=morphrnnbidirectional,morphfflayerdim=morphfflayerdim,batchsize=batchsize,sequencelength=sequencelength) @@ -615,10 +615,11 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdr self.trainingdatafile = '../data/sentsplit_postag_train_gold.tab' self.devdatafile = '../data/sentsplit_postag_dev_gold.tab' - self.bestmodel = bestmodelpath + datatype + '_best_sent_pos_model.pt' + self.bestmodel = bestmodelpath + datatype + '_best_mtlmodel.pt' self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + self.trainflag = trainflag self.trainfile = trainfile self.devfile = devfile @@ -692,7 +693,7 @@ def read_file(mode='train'): return dataset epochs = 5000 - bestloss = float('inf') + bestf1 = float('-inf') trainingdata = read_file() devdata = read_file(mode='dev') @@ -907,8 +908,8 @@ def read_file(mode='train'): print('\n') # save the best model - if mtlloss < bestloss: - bestloss = mtlloss + if (sbdscores.f1 + posscores.f1 + featsscores.f1) / 3 > bestf1: + bestf1 = (sbdscores.f1 + posscores.f1 + featsscores.f1) / 3 bestmodel = self.bestmodel.replace('.pt','_' + str(round(mtlloss,6)) + '_' + str(round(sbdscores.f1,6)) + '_' + str(round(posscores.f1,6)) + '_' + str(round(featsscores.f1,6)) + '.pt') 
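The hunk above, together with the torch.save call that follows it, changes how the best checkpoint is chosen: instead of keeping the model with the lowest multitask loss, the trainer keeps the one with the highest mean F1 over the sentence-splitting, POS and morphology heads, and bundles the model, optimizer and POS-CRF state dicts into a single file. A minimal sketch of that selection-and-save pattern — the helper name save_if_best and the dict-of-scores argument are illustrative assumptions, not HebPipe's actual API:

import torch

def save_if_best(epoch, scores, best_f1, model, optimizer, poscrf, path_template):
    # `scores` is assumed to be a plain dict like {'sbd': f1, 'pos': f1, 'feats': f1};
    # the patch reads .f1 attributes off per-task score objects instead.
    mean_f1 = sum(scores.values()) / len(scores)
    if mean_f1 <= best_f1:
        return best_f1                      # no improvement: keep the earlier checkpoint
    suffix = '_'.join(str(round(v, 6)) for v in scores.values())
    path = path_template.replace('.pt', '_' + suffix + '.pt')
    torch.save({'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'poscrf_state_dict': poscrf.state_dict()},   # CRF transitions saved alongside the model
               path)
    return mean_f1

Selecting on averaged dev F1 rather than training loss ties model selection directly to the metrics the three heads are evaluated on.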
torch.save({'epoch':epoch,'model_state_dict':self.mtlmodel.state_dict(),'optimizer_state_dict':self.optimizer.state_dict(),'poscrf_state_dict':self.mtlmodel.poscrf.state_dict()},bestmodel) @@ -1371,18 +1372,23 @@ def main(): # testing only parser = argparse.ArgumentParser() parser.add_argument('--lr', type=float, default=1e-3) parser.add_argument('--seqlen', type=int, default=256) - parser.add_argument('--trainbatch', type=int, default=16) + parser.add_argument('--trainbatch', type=int, default=32) parser.add_argument('--datatype', type=str, default='wiki') parser.add_argument('--sbdrnndim', type=int, default=256) parser.add_argument('--posrnndim', type=int, default=512) + parser.add_argument('--morphrnndim', type=int, default=512) parser.add_argument('--sbdfflayerdim', type=int, default=256) parser.add_argument('--posfflayerdim', type=int, default=512) + parser.add_argument('--morphfflayerdim', type=int, default=512) parser.add_argument('--posrnnbidirectional', type=bool, default=True) parser.add_argument('--sbdrnnbidirectional', type=bool, default=True) + parser.add_argument('--morphrnnbidirectional', type=bool, default=True) parser.add_argument('--posrnnnumlayers', type=int, default=1) parser.add_argument('--sbdrnnnumlayers', type=int, default=1) + parser.add_argument('--morphrnnnumlayers', type=int, default=1) parser.add_argument('--sbdencodertype', type=str, default='lstm') parser.add_argument('--posencodertype', type=str, default='lstm') + parser.add_argument('--morphencodertype', type=str, default='lstm') parser.add_argument('--dropout', type=float, default=0.05) parser.add_argument('--worddropout', type=float, default=0.05) parser.add_argument('--lockeddropout', type=float, default=0.5) From d88c06f9156ae3fdfa76357e411b9d4af8e2e2e0 Mon Sep 17 00:00:00 2001 From: amir-zeldes Date: Wed, 24 Aug 2022 13:43:04 -0400 Subject: [PATCH 23/32] update models URL --- hebpipe/heb_pipe.py | 2 +- hebpipe/lib/_version.py | 2 +- requirements.txt | 2 +- setup.py | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index 57fd2b3..57470f8 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -566,7 +566,7 @@ def check_requirements(): def download_requirements(models_ok=True): urls = [] if not models_ok: - models_base = "http://corpling.uis.georgetown.edu/amir/download/heb_models_v2/" + models_base = "http://gucorpling.org/amir/download/heb_models_v2/" urls.append(models_base + "heb.sm" + str(sys.version_info[0])) urls.append(models_base + "heb.diaparser") urls.append(models_base + "heb.sent") diff --git a/hebpipe/lib/_version.py b/hebpipe/lib/_version.py index fc2b54c..b92e3cd 100644 --- a/hebpipe/lib/_version.py +++ b/hebpipe/lib/_version.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -__version__ = "2.0.0.2" +__version__ = "2.0.1.0" __author__ = "Amir Zeldes" __copyright__ = "Copyright 2018-2022, Amir Zeldes" __license__ = "Apache 2.0 License" diff --git a/requirements.txt b/requirements.txt index 89c70d0..0e0e97d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ depedit pandas joblib xmltodict -diaparser +diaparser==1.1.2 flair==0.6.1 stanza conllu diff --git a/setup.py b/setup.py index 4de62ab..419a20b 100644 --- a/setup.py +++ b/setup.py @@ -3,16 +3,16 @@ setup( name = 'hebpipe', packages = find_packages(), - version = '2.0.0.2', + version = '2.0.1.0', description = 'A pipeline for Hebrew NLP', author = 'Amir Zeldes', author_email = 'amir.zeldes@georgetown.edu', package_data = 
{'':['README.md','LICENSE.md','requirements.txt'],'hebpipe':['lib/*','data/*','bin/*','models/models_go_here.txt','models/stanza/stanza_models_here.txt']}, install_requires=['numpy','transformers==3.5.1','torch==1.6.0','pandas','scipy','joblib','xgboost==0.81','rftokenizer','depedit','xmltodict', - 'diaparser','flair==0.6.1','stanza','conllu'], + 'diaparser==1.1.2','flair==0.6.1','stanza','conllu'], url = 'https://github.com/amir-zeldes/HebPipe', license='Apache License, Version 2.0', - download_url = 'https://github.com/amir-zeldes/HebPipe/releases/tag/v2.0.0.2', + download_url = 'https://github.com/amir-zeldes/HebPipe/releases/tag/v2.0.1.0', keywords = ['NLP', 'Hebrew', 'segmentation', 'tokenization', 'tagging', 'parsing','morphology','POS','lemmatization'], classifiers = ['Programming Language :: Python', 'Programming Language :: Python :: 2', From d2725d01c9d64c25cf2216df0d370c58a600ae2a Mon Sep 17 00:00:00 2001 From: nitin Date: Thu, 8 Sep 2022 12:45:36 +0800 Subject: [PATCH 24/32] integrated with the pipeline --- README.md | 9 +- hebpipe/heb_pipe.py | 208 ++++++++++---------------------- hebpipe/lib/crfutils/crf.py | 7 +- hebpipe/lib/crfutils/viterbi.py | 15 ++- hebpipe/lib/mtlmodel.py | 66 +++++----- 5 files changed, 119 insertions(+), 186 deletions(-) diff --git a/README.md b/README.md index 14dc74b..ab1040b 100644 --- a/README.md +++ b/README.md @@ -97,9 +97,8 @@ standard module options: forms -t, --tokenize Tokenize large word forms into smaller morphological segments - -p, --pos Do POS tagging + -g, --posmorph Do POS tagging and Morphological Tagging -l, --lemma Do lemmatization - -m, --morph Do morphological tagging -d, --dependencies Parse with dependency parser -e, --entities Add entity spans and types -c, --coref Add coreference annotations @@ -123,7 +122,7 @@ less common options: Whitespace tokenize, tokenize morphemes, add pos, lemma, morph, dep parse with automatic sentence splitting, entity recognition and coref for one text file, output in default conllu format: -> python heb_pipe.py -wtplmdec example_in.txt +> python heb_pipe.py -wtgldec example_in.txt OR specify no processing options (automatically assumes you want all steps) > python heb_pipe.py example_in.txt @@ -132,10 +131,10 @@ Just tokenize a file using pipes: > python heb_pipe.py -wt -o pipes example_in.txt Pos tag, lemmatize, add morphology and parse a pre-tokenized file, splitting sentences by existing tags: -> python heb_pipe.py -plmd -s sent example_in.txt +> python heb_pipe.py -gld -s sent example_in.txt Add full analyses to a whole directory of *.txt files, output to a specified directory: -> python heb_pipe.py -wtplmdec --dirout /home/heb/out/ *.txt +> python heb_pipe.py -wtgldec --dirout /home/heb/out/ *.txt Parse a tagged TT SGML file into CoNLL tabular format for treebanking, use existing tag to recognize sentence borders: > python heb_pipe.py -d -s sent example_in.tt diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index 18b7542..6b84f9a 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -93,12 +93,10 @@ def log_tasks(opts): sys.stderr.write("o Whitespace tokenization\n") if opts.tokenize: sys.stderr.write("o Morphological segmentation\n") - if opts.pos: - sys.stderr.write("o POS tagging\n") + if opts.posmorph: + sys.stderr.write("o POS and Morphological tagging\n") if opts.lemma: sys.stderr.write("o Lemmatization\n") - if opts.morph: - sys.stderr.write("o Morphological analysis\n") if opts.dependencies: sys.stderr.write("o Dependency parsing\n") if opts.entities: @@ -110,15 +108,14 
@@ def log_tasks(opts): def diagnose_opts(opts): - if not opts.pos and not opts.morph and not opts.whitespace and not opts.tokenize and not opts.lemma \ + if not opts.posmorph and not opts.whitespace and not opts.tokenize and not opts.lemma \ and not opts.dependencies and not opts.entities and not opts.coref: if not opts.quiet: sys.stderr.write("! You selected no processing options\n") sys.stderr.write("! Assuming you want all processing steps\n") opts.whitespace = True opts.tokenize = True - opts.pos = True - opts.morph = True + opts.posmorph = True opts.lemma = True opts.dependencies = True opts.entities = True @@ -128,15 +125,13 @@ def diagnose_opts(opts): trigger = "" if opts.dependencies: trigger = "depenedencies" - if not opts.pos: - added.append("pos") - opts.pos = True + if not opts.posmorph: + added.append("posmorph") + opts.posmorph = True if not opts.lemma: added.append("lemma") opts.lemma = True - if not opts.morph: - added.append("morph") - opts.morph = True + if len(added)>0: sys.stderr.write("! You selected "+trigger+"\n") sys.stderr.write("! Turning on options: "+",".join(added) +"\n") @@ -552,8 +547,7 @@ def add_feat(morph, feat): def check_requirements(): models_OK = True model_files = ["heb.sm" + str(sys.version_info[0]), "heb.xrm", - "heb.flair","heb.morph", - "heb.sent","heb.diaparser", + "heb.diaparser", "heb.sbdposmorph.pt", "stanza" + os.sep + "he_lemmatizer.pt", "stanza" + os.sep + "he_htb.pretrain.pt", ] @@ -571,12 +565,10 @@ def download_requirements(models_ok=True): models_base = "http://corpling.uis.georgetown.edu/amir/download/heb_models_v2/" urls.append(models_base + "heb.sm" + str(sys.version_info[0])) urls.append(models_base + "heb.diaparser") - urls.append(models_base + "heb.sent") urls.append(models_base + "heb.xrm") - urls.append(models_base + "heb.flair") - urls.append(models_base + "heb.morph") urls.append(models_base + "he_htb.pretrain.pt") urls.append(models_base + "he_lemmatizer.pt") + urls.append(models_base + 'heb.sbdposmorph.pt') for u in urls: sys.stderr.write("o Downloading from " + str(u) + "\n") base_name = u[u.rfind("/") + 1:] @@ -588,7 +580,7 @@ def download_requirements(models_ok=True): def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, do_parse=True, do_entity=True, - out_mode="conllu", sent_tag=None, preloaded=None, punct_sentencer=False, from_pipes=False, filecount=1): + out_mode="conllu", sent_tag=None, preloaded=None, from_pipes=False,cpu=False): data = input_data.replace("\t","") data = data.replace("\r","") @@ -596,16 +588,14 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, if from_pipes: input_data = input_data.replace("|","") - mtltagger = Tagger(trainflag=False,bestmodelpath='data/checkpoint/',sequencelength=256) - - if preloaded is not None: - rf_tok, xrenner, _, parser, _, _, lemmatizer = preloaded + rf_tok, xrenner, mtltagger,parser, lemmatizer = preloaded else: rf_tok = RFTokenizer(model=model_dir + "heb.sm" + str(sys.version_info[0])) xrenner = Xrenner(model=model_dir + "heb.xrm") parser = None if not do_parse else Parser.load(model_dir + "heb.diaparser",verbose=False) lemmatizer = None if not do_lemma and not do_tag else init_lemmatizer() + mtltagger = Tagger(trainflag=False, bestmodelpath=model_dir, sequencelength=256,cpu=cpu) if do_whitespace: data = whitespace_tokenize(data, abbr=data_dir + "heb_abbr.tab",add_sents=sent_tag=="auto", from_pipes=from_pipes) @@ -622,140 +612,63 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, 
bound_group_map = get_bound_group_map(tokenized) if out_mode == "conllu" else None - """ - if sent_tag == "auto": - sent_tag = "s" - if punct_sentencer: - tokenized = toks_to_sents(tokenized) - else: - tokenized = flair_sent_splitter.split(tokenized) - if filecount == 1: - # Free up GPU memory if no more files need it - del flair_sent_splitter - torch.cuda.empty_cache() - + if mtltagger: + tagged_conllu, tokenized, morphs, words = mtltagger.predict(tokenized,sent_tag=sent_tag,checkpointfile=model_dir + 'heb.sbdposmorph.pt') + if out_mode == "pipes": return tokenized else: tokenized = tokenized.split("\n") retokenized = [] for line in tokenized: - if line == "|": + if line == '|': retokenized.append(line) else: retokenized.append("\n".join(line.split("|"))) - tokenized = "\n".join(retokenized) - - if do_tag: - # Flair - to_tag = conllize(tokenized,element="s",super_mapping=bound_group_map,attrs_as_comments=True) - tagged_conllu = tagger.predict(to_tag, in_format="conllu", as_text=True) - # Uncomment to test lemmatizer with gold POS tags - #tagged_conllu = io.open("he_htb-ud-test.conllu",encoding="utf8").read() - #opts = type('', (), {"quiet":False, "kill":"both"})() - #d = DepEdit(config_file=[],options=opts) - #tagged_conllu = d.run_depedit(tagged_conllu) - pos_tags = [l.split("\t")[3] for l in tagged_conllu.split("\n") if "\t" in l] - """ - - sent_tag = 's' - tagged_conllu, tokenized, morphs, words = mtltagger.predict(tokenized,checkpointfile='/home/nitin/Desktop/hebpipe/HebPipe/hebpipe/data/checkpoint/top_wiki_best_mtlmodel_38.503373_0.890511_0.972855_0.72344.pt') + tokenized = "\n".join(retokenized) + + if sent_tag == 'auto': sent_tag = 's' del mtltagger del rf_tok torch.cuda.empty_cache() - zeros = ["0" for i in range(len(morphs))] - zero_conllu = inject_col(zeros, tagged_conllu, into_col=6, skip_supertoks=True) - lemmas = lemmatize(lemmatizer, zero_conllu, morphs) - tagged = inject_col(tagged_conllu, tokenized, 4) - - if do_lemma: - lemmatized = inject_col(lemmas, tagged, -1) - else: - lemmatized = tagged - - morphs = postprocess_morph(morphs, words, lemmas) - morphed = inject_col(morphs, lemmatized, -1) - - """ if do_tag: - - #morpher = None - if morpher is None: - # Marmot - if platform.system() == "Windows": - tag = ["java","-Dfile.encoding=UTF-8","-Xmx2g","-cp","marmot.jar;trove.jar","marmot.morph.cmd.Annotator","-model-file","heb.marmot","-lemmatizer-file","heb.lemming","-test-file","form-index=0,tempfilename","-pred-file","tempfilename2"] - else: - tag = ["java","-Dfile.encoding=UTF-8","-Xmx2g","-cp","marmot.jar:trove.jar","marmot.morph.cmd.Annotator","-model-file","heb.marmot","-lemmatizer-file","heb.lemming","-test-file","form-index=0,tempfilename","-pred-file","tempfilename2"] - no_sent = re.sub(r']+)?>\n?','',tokenized).strip() - morphed = exec_via_temp(no_sent, tag, workdir=marmot_path, outfile=True) - morphed = morphed.strip().split("\n") - # Clean up tags for OOV glyphs - cleaned = [] - toknum = 0 - for line in morphed: - if "\t" in line: - fields = line.split("\t") - fields[5] = pos_tags[toknum] # Insert flair tags - if fields[1] in KNOWN_PUNCT: # Hard fix unicode punctuation - fields[5] = "PUNCT" - fields[7] = "_" - line = "\t".join(fields) - toknum += 1 - cleaned.append(line) - # morphed = cleaned - morphs = get_col(morphed, 7) - words = get_col(morphed, 1) - lemmas = get_col(morphed, 3) - tagged = inject_col(morphed, tokenized, 5) - else: - # flair - morphed = morpher.predict(tagged_conllu, in_format="conllu", as_text=True, tags=True) - morphs = get_col(morphed, 4) - 
words = get_col(morphed, 1) - # Uncomment to test with gold morphology from tagged_conllu - #morphs = get_col(tagged_conllu, 5) - #morphed = inject_col(morphs, tagged_conllu, into_col=5, skip_supertoks=True) - zeros = ["0" for i in range(len(morphs))] - zero_conllu = inject_col(zeros,tagged_conllu,into_col=6, skip_supertoks=True) - lemmas = lemmatize(lemmatizer,zero_conllu,morphs) - tagged = inject_col(tagged_conllu,tokenized,4) + zeros = ["0" for i in range(len(morphs))] + zero_conllu = inject_col(zeros, tagged_conllu, into_col=6, skip_supertoks=True) + lemmas = lemmatize(lemmatizer, zero_conllu, morphs) + tagged = inject_col(tagged_conllu, tokenized, 4) if do_lemma: - lemmatized = inject_col(lemmas,tagged,-1) + lemmatized = inject_col(lemmas, tagged, -1) else: lemmatized = tagged morphs = postprocess_morph(morphs, words, lemmas) - morphed = inject_col(morphs,lemmatized,-1) - """ + morphed = inject_col(morphs, lemmatized, -1) - if not do_parse: - if out_mode == "conllu": - conllized = conllize(morphed, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, - attrs_as_comments=True) - conllized = add_space_after(input_data,conllized) - return conllized - else: - if not PY3: - morphed = morphed.decode("utf8") - return morphed - - """ - else: + if not do_parse: if out_mode == "conllu": - conllized = conllize(tokenized, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, + conllized = conllize(morphed, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, attrs_as_comments=True) - conllized = add_space_after(input_data, conllized) + conllized = add_space_after(input_data,conllized) return conllized else: - return tokenized - """ + if not PY3: + morphed = morphed.decode("utf8") + return morphed else: - conllized = conllize(morphed, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, - attrs_as_comments=True, ten_cols=True) + if out_mode == "conllu": + conllized = conllize(tokenized, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, + attrs_as_comments=True, ten_cols=True) + conllized = add_space_after(input_data,conllized) + return conllized + else: + return tokenized + + if do_parse: + conllized = conllize(morphed,tag="PUNCT",element=sent_tag,no_zero=True,super_mapping=bound_group_map,attrs_as_comments=True,ten_cols=True) parsed = diaparse(parser, conllized) parsed = morph_deped.run_depedit(parsed) @@ -772,6 +685,13 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, else: parsed = add_space_after(input_data,parsed) return parsed + else: + if out_mode == "conllu": + conllized = conllize(tagged,tag="PUNCT",element=sent_tag,no_zero=True,super_mapping=bound_group_map,attrs_as_comments=True) + conllized = add_space_after(input_data,conllized) + return conllized + else: + return tagged def run_hebpipe(): @@ -812,9 +732,8 @@ def run_hebpipe(): g1 = parser.add_argument_group("standard module options") g1.add_argument("-w","--whitespace", action="store_true", help='Perform white-space based tokenization of large word forms') g1.add_argument("-t","--tokenize", action="store_true", help='Tokenize large word forms into smaller morphological segments') - g1.add_argument("-p","--pos", action="store_true", help='Do POS tagging') + g1.add_argument("-g","--posmorph", action="store_true", help='Do POS and Morph tagging') g1.add_argument("-l","--lemma", action="store_true", help='Do lemmatization') - g1.add_argument("-m","--morph", action="store_true", help='Do 
morphological tagging') g1.add_argument("-d","--dependencies", action="store_true", help='Parse with dependency parser') g1.add_argument("-e","--entities", action="store_true", help='Add entity spans and types') g1.add_argument("-c","--coref", action="store_true", help='Add coreference annotations') @@ -857,11 +776,11 @@ def run_hebpipe(): log_tasks(opts) # Check if models, Marmot and Malt Parser are available - if opts.pos or opts.lemma or opts.morph or opts.dependencies or opts.tokenize or opts.entities: + if opts.posmorph or opts.lemma or opts.dependencies or opts.tokenize or opts.entities: models_OK = check_requirements() if not models_OK: sys.stderr.write("! You are missing required software:\n") - if (opts.pos or opts.lemma or opts.morph): + if (opts.posmorph or opts.lemma): sys.stderr.write(" - Tagging, lemmatization and morphological analysis require models\n") if not models_OK: sys.stderr.write(" - Model files in models/ are missing\n") @@ -871,16 +790,16 @@ def run_hebpipe(): else: sys.stderr.write("Aborting\n") sys.exit(0) - #tagger = FlairTagger() - tagger = None - #morpher = FlairTagger(morph=True) - morpher = None + lemmatizer = init_lemmatizer(cpu=opts.cpu, no_post_process=opts.disable_lex) else: - tagger = None - morpher = None lemmatizer = None + if opts.posmorph: + mtltagger = Tagger(trainflag=False, bestmodelpath=model_dir, sequencelength=256,cpu=opts.cpu) + else: + mtltagger = None + if dotok: # Pre-load stacked tokenizer for entire batch rf_tok = RFTokenizer(model=model_dir + "heb.sm" + str(sys.version_info[0])) else: @@ -889,8 +808,7 @@ def run_hebpipe(): xrenner = Xrenner(model=model_dir + "heb.xrm") else: xrenner = None - #flair_sent_splitter = FlairSentSplitter() if opts.sent == "auto" and not opts.punct_sentencer else None - flair_sent_splitter = None + dep_parser = Parser.load(model_dir+"heb.diaparser") if opts.dependencies else None for infile in files: @@ -906,14 +824,14 @@ def run_hebpipe(): sys.stderr.write("Processing " + base + "\n") try: - input_text = io.open(infile,encoding="utf8").read().replace("\r","") + input_text = io.open(infile,encoding="utf8").read().replace("\r","").replace('\n'," ") except UnicodeDecodeError: # Fallback to support Windows Hebrew encoding input_text = io.open(infile,encoding="cp1255").read().replace("\r","") - processed = nlp(input_text, do_whitespace=opts.whitespace, do_tok=dotok, do_tag=opts.pos, do_lemma=opts.lemma, + processed = nlp(input_text, do_whitespace=opts.whitespace, do_tok=dotok, do_tag=opts.posmorph, do_lemma=opts.lemma, do_parse=opts.dependencies, do_entity=opts.entities, out_mode=opts.out, - sent_tag=opts.sent, preloaded=(rf_tok,xrenner,flair_sent_splitter,dep_parser, tagger, morpher, lemmatizer), - punct_sentencer=opts.punct_sentencer,from_pipes=opts.from_pipes, filecount=len(files)) + sent_tag=opts.sent, preloaded=(rf_tok,xrenner,mtltagger,dep_parser,lemmatizer), + from_pipes=opts.from_pipes,cpu=opts.cpu) if len(files) > 1: with io.open(opts.dirout + os.sep + outfile, 'w', encoding="utf8", newline="\n") as f: diff --git a/hebpipe/lib/crfutils/crf.py b/hebpipe/lib/crfutils/crf.py index aa99d17..44d38b8 100644 --- a/hebpipe/lib/crfutils/crf.py +++ b/hebpipe/lib/crfutils/crf.py @@ -10,7 +10,7 @@ class CRF(torch.nn.Module): but also on previous seen annotations. 
""" - def __init__(self, tag_dictionary, tagset_size: int, init_from_state_dict: bool): + def __init__(self, tag_dictionary, tagset_size: int, init_from_state_dict: bool,cpu=False): """ :param tag_dictionary: tag dictionary in order to find ID for start and stop tags :param tagset_size: number of tag from tag dictionary @@ -20,7 +20,10 @@ def __init__(self, tag_dictionary, tagset_size: int, init_from_state_dict: bool) self.tagset_size = tagset_size - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + if cpu == False: + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + else: + self.device = 'cpu' # Transitions are used in the following way: transitions[to, from]. self.transitions = torch.nn.Parameter(torch.randn(tagset_size, tagset_size)) diff --git a/hebpipe/lib/crfutils/viterbi.py b/hebpipe/lib/crfutils/viterbi.py index 858f845..fe5ba92 100644 --- a/hebpipe/lib/crfutils/viterbi.py +++ b/hebpipe/lib/crfutils/viterbi.py @@ -16,7 +16,7 @@ class ViterbiLoss(torch.nn.Module): Calculates the loss for each sequence up to its length t. """ - def __init__(self, tag_dictionary: Dictionary): + def __init__(self, tag_dictionary: Dictionary,cpu=False): """ :param tag_dictionary: tag_dictionary of task """ @@ -26,8 +26,10 @@ def __init__(self, tag_dictionary: Dictionary): self.start_tag = tag_dictionary.get_idx_for_item(START_TAG) self.stop_tag = tag_dictionary.get_idx_for_item(STOP_TAG) - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - + if cpu == False: + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + else: + self.device = "cpu" def forward(self, features_tuple: tuple, targets: torch.Tensor) -> torch.Tensor: """ @@ -129,7 +131,7 @@ class ViterbiDecoder: Decodes a given sequence using the Viterbi algorithm. 
""" - def __init__(self, tag_dictionary: Dictionary): + def __init__(self, tag_dictionary: Dictionary,cpu=False): """ :param tag_dictionary: Dictionary of tags for sequence labeling task """ @@ -138,7 +140,10 @@ def __init__(self, tag_dictionary: Dictionary): self.start_tag = tag_dictionary.get_idx_for_item(START_TAG) self.stop_tag = tag_dictionary.get_idx_for_item(STOP_TAG) - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + if cpu == False: + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + else: + self.device = 'cpu' def decode( diff --git a/hebpipe/lib/mtlmodel.py b/hebpipe/lib/mtlmodel.py index c56d83d..6c53083 100644 --- a/hebpipe/lib/mtlmodel.py +++ b/hebpipe/lib/mtlmodel.py @@ -43,10 +43,13 @@ def __init__(self, start, end): class MTLModel(nn.Module): - def __init__(self,sbdrnndim=256,posrnndim=512,morphrnndim=512,sbdrnnnumlayers=1,posrnnnumlayers=1,morphrnnnumlayers=1,posfflayerdim=512,morphfflayerdim=512,sbdrnnbidirectional=True,posrnnbidirectional=True,morphrnnbidirectional=True,sbdencodertype='lstm',sbdfflayerdim=256,posencodertype='lstm',morphencodertype='lstm',batchsize=32,sequencelength=256,dropout=0.0,wordropout=0.05,lockeddropout=0.5): + def __init__(self,sbdrnndim=256,posrnndim=512,morphrnndim=512,sbdrnnnumlayers=1,posrnnnumlayers=1,morphrnnnumlayers=1,posfflayerdim=512,morphfflayerdim=512,sbdrnnbidirectional=True,posrnnbidirectional=True,morphrnnbidirectional=True,sbdencodertype='lstm',sbdfflayerdim=256,posencodertype='lstm',morphencodertype='lstm',batchsize=32,sequencelength=256,dropout=0.0,wordropout=0.05,lockeddropout=0.5,cpu=False): super(MTLModel,self).__init__() - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + if cpu == False: + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + else: + self.device = "cpu" # tagsets - amend labels here self.postagset = {'ADJ':0, 'ADP':1, 'ADV':2, 'AUX':3, 'CCONJ':4, 'DET':5, 'INTJ':6, 'NOUN':7, 'NUM':8, 'PRON':9, 'PROPN':10, 'PUNCT':11, 'SCONJ':12, 'SYM':13, 'VERB':14, 'X':15} # derived from HTB and IAHLTWiki trainsets #TODO: add other UD tags? 
@@ -60,18 +63,8 @@ def __init__(self,sbdrnndim=256,posrnndim=512,morphrnndim=512,sbdrnnnumlayers=1, self.postagsetcrf.add_item("") self.postagsetcrf.add_item("") - """ # FEATS dictionary # IMPORTANT: This should be sorted by key - self.featstagset = {'Abbr=Yes':0, 'Aspect=Prog':1, 'Case=Acc':2, 'Case=Gen':3, 'Definite=Com':4, 'Definite=Cons':5, 'Definite=Def':6, 'Definite=Ind':7, 'Definite=Spec':8, - 'Foreign=Yes':9, 'Gender=Fem':10, 'Gender=Masc':11, 'HebBinyan=HIFIL':12, 'HebBinyan=HITPAEL':13, 'HebBinyan=HUFAL':14, 'HebBinyan=NIFAL':15, - 'HebBinyan=NITPAEL':16, 'HebBinyan=PAAL':17, 'HebBinyan=PIEL':18, 'HebBinyan=PUAL':19, 'Mood=Imp':20, - 'NumType=Card':21, 'NumType=Ord':22, 'Number=Dual':23, 'Number=Plur':24, 'Number=Sing':25, 'Person=1':26, - 'Person=2':27, 'Person=3':28, 'Polarity=Neg':29, 'Polarity=Pos':30, 'Poss=Yes':31, 'Prefix=Yes':32, 'PronType=Art':33, 'PronType=Dem':34, - 'PronType=Emp':35, 'PronType=Ind':36, 'PronType=Int':37, 'PronType=Prs':38, 'Reflex=Yes':39, 'Tense=Fut':40, 'Tense=Past':41, 'Tense=Pres':42, - 'Typo=Yes':43, 'VerbForm=Inf':44, 'VerbForm=Part':45, 'VerbType=Cop':46, 'VerbType=Mod':47, 'Voice=Act':48, 'Voice=Mid':49, 'Voice=Pass':50} - - """ self.featstagset = {'Definite=Com':0, 'Definite=Cons':1, 'Definite=Def':2, 'Definite=Ind':3, 'Definite=Spec':4, 'Gender=Fem':5, 'Gender=Masc':6, 'HebBinyan=HIFIL':7, 'HebBinyan=HITPAEL':8, 'HebBinyan=HUFAL':9, 'HebBinyan=NIFAL':10, 'HebBinyan=NITPAEL':11, 'HebBinyan=PAAL':12, 'HebBinyan=PIEL':13, 'HebBinyan=PUAL':14, 'Number=Dual':15, 'Number=Plur':16, 'Number=Sing':17, @@ -253,8 +246,8 @@ def __init__(self,sbdrnndim=256,posrnndim=512,morphrnndim=512,sbdrnnnumlayers=1, self.worddropout = WordDropout(wordropout) self.lockeddropout = LockedDropout(lockeddropout) - self.poscrf = CRF(self.postagsetcrf,len(self.postagsetcrf),init_from_state_dict=False).to(self.device) # TODO: parameterize - self.viterbidecoder = ViterbiDecoder(self.postagsetcrf) + self.poscrf = CRF(self.postagsetcrf,len(self.postagsetcrf),init_from_state_dict=False,cpu=cpu).to(self.device) # TODO: parameterize + self.viterbidecoder = ViterbiDecoder(self.postagsetcrf,cpu=cpu) for name, param in self.poscrf.named_parameters(): if 'bias' in name: @@ -596,9 +589,9 @@ def forward(self,data,mode='train'): return sbdlogits, finalsbdlabels, sbdpreds, poslogits, poslabels, pospreds, featslogits,featslabels # returns the logits and labels class Tagger(): - def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=256,sbdrnnnumlayers=1,sbdrnnbidirectional=True,sbdfflayerdim=256,posrnndim=512,posrnnnumlayers=1,posrnnbidirectional=True,posfflayerdim=512,morphrnndim=512,morphrnnnumlayers=1,morphrnnbidirectional=True,morphfflayerdim=512,morphencodertype='lstm',dropout=0.05,wordropout=0.05,lockeddropout=0.5,sbdencodertype='lstm',posencodertype='lstm',learningrate = 0.001,bestmodelpath='../data/checkpoint/',batchsize=32,sequencelength=256,datatype='htb'): + def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdrnndim=256,sbdrnnnumlayers=1,sbdrnnbidirectional=True,sbdfflayerdim=256,posrnndim=512,posrnnnumlayers=1,posrnnbidirectional=True,posfflayerdim=512,morphrnndim=512,morphrnnnumlayers=1,morphrnnbidirectional=True,morphfflayerdim=512,morphencodertype='lstm',dropout=0.05,wordropout=0.05,lockeddropout=0.5,sbdencodertype='lstm',posencodertype='lstm',learningrate = 0.001,bestmodelpath='../data/checkpoint/',batchsize=32,sequencelength=256,datatype='htb',cpu=False): - self.mtlmodel = 
MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdencodertype=sbdencodertype,sbdfflayerdim=sbdfflayerdim,dropout=dropout,wordropout=wordropout,lockeddropout=lockeddropout,posrnndim=posrnndim,posrnnbidirectional=posrnnbidirectional,posencodertype=posencodertype,posrnnnumlayers=posrnnnumlayers,posfflayerdim=posfflayerdim,morphrnndim=morphrnndim,morphrnnnumlayers=morphrnnnumlayers,morphencodertype=morphencodertype,morphrnnbidirectional=morphrnnbidirectional,morphfflayerdim=morphfflayerdim,batchsize=batchsize,sequencelength=sequencelength) + self.mtlmodel = MTLModel(sbdrnndim=sbdrnndim,sbdrnnnumlayers=sbdrnnnumlayers,sbdrnnbidirectional=sbdrnnbidirectional,sbdencodertype=sbdencodertype,sbdfflayerdim=sbdfflayerdim,dropout=dropout,wordropout=wordropout,lockeddropout=lockeddropout,posrnndim=posrnndim,posrnnbidirectional=posrnnbidirectional,posencodertype=posencodertype,posrnnnumlayers=posrnnnumlayers,posfflayerdim=posfflayerdim,morphrnndim=morphrnndim,morphrnnnumlayers=morphrnnnumlayers,morphencodertype=morphencodertype,morphrnnbidirectional=morphrnnbidirectional,morphfflayerdim=morphfflayerdim,batchsize=batchsize,sequencelength=sequencelength,cpu=cpu) if trainflag == True: @@ -617,8 +610,10 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdr self.bestmodel = bestmodelpath + datatype + '_best_mtlmodel.pt' - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - + if cpu == False: + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + else: + self.device = "cpu" self.trainflag = trainflag self.trainfile = trainfile @@ -628,7 +623,7 @@ def __init__(self,trainflag=False,trainfile=None,devfile=None,testfile=None,sbdr self.learningrate = learningrate # Loss for pos tagging - self.postagloss = ViterbiLoss(self.mtlmodel.postagsetcrf) + self.postagloss = ViterbiLoss(self.mtlmodel.postagsetcrf,cpu=cpu) self.postagloss.to(self.device) # Loss for sentence splitting @@ -913,7 +908,7 @@ def read_file(mode='train'): bestmodel = self.bestmodel.replace('.pt','_' + str(round(mtlloss,6)) + '_' + str(round(sbdscores.f1,6)) + '_' + str(round(posscores.f1,6)) + '_' + str(round(featsscores.f1,6)) + '.pt') torch.save({'epoch':epoch,'model_state_dict':self.mtlmodel.state_dict(),'optimizer_state_dict':self.optimizer.state_dict(),'poscrf_state_dict':self.mtlmodel.poscrf.state_dict()},bestmodel) - def inference(self,toks,checkpointfile=None): + def inference(self,toks,sent_tag='auto',checkpointfile=None): def is_tok(sgml_line): return len(sgml_line) > 0 and not (sgml_line.startswith("<") and sgml_line.endswith(">")) @@ -930,6 +925,8 @@ def unescape(token): toks = unescape(toks) # Splitter is trained on UTF-8 forms, since LM embeddings know characters like '&' lines = toks.strip().split("\n") + taggedsbdpreds = [] + # add super token tags supertokenlabels = [] for i in range(0,len(lines)): @@ -952,10 +949,18 @@ def unescape(token): else: supertokenlabels.append("I") + if sent_tag != 'auto': + if i != 2 and lines[i - 2] == "<" + sent_tag + ">": + taggedsbdpreds.append(1) + else: + taggedsbdpreds.append(0) + toks = [l for l in lines if is_tok(l)] toks = [re.sub(r"\t.*", "", t) for t in toks] assert len(toks) == len(supertokenlabels) + if sent_tag != 'auto': + assert len(taggedsbdpreds) == len(toks) # slice up the token list into slices of seqlen for GPU RAM reasons for idx in range(0, len(toks), self.mtlmodel.sequence_length): @@ -974,7 +979,7 @@ def unescape(token): if checkpointfile is not 
None: - checkpoint = torch.load(checkpointfile) + checkpoint = torch.load(checkpointfile,map_location=self.device) self.mtlmodel.load_state_dict(checkpoint['model_state_dict']) self.optimizer.load_state_dict(checkpoint['optimizer_state_dict']) self.mtlmodel.poscrf.load_state_dict(checkpoint['poscrf_state_dict']) @@ -1022,6 +1027,10 @@ def unescape(token): allpospreds = [self.mtlmodel.postagsetcrf.get_item_for_index(p) for p in allpospreds] + + if sent_tag != 'auto': + allsbdpreds = taggedsbdpreds + return allsbdpreds,allpospreds, allfeatspreds, allwords def prepare_data_files(self): @@ -1092,7 +1101,7 @@ def read_conllu(self,mode='train'): with open(file, "r", encoding="utf-8") as f: return conllu.parse(f.read(), fields=fields) - def predict(self, xml_data,out_mode='conllu',checkpointfile = None): + def predict(self, xml_data,out_mode='conllu',sent_tag='auto',checkpointfile = None): def is_sgml_tag(line): return line.startswith("<") and line.endswith(">") @@ -1257,7 +1266,7 @@ def get_bound_group_map(data): # don't feed the sentencer our pos and lemma predictions, if we have them no_pos_lemma = re.sub(r"([^\n\t]*?)\t[^\n\t]*?\t[^\n\t]*?\n", r"\1\n", xml_data) - split_indices, pos_tags, morphs, words = self.inference(no_pos_lemma,checkpointfile=checkpointfile) + split_indices, pos_tags, morphs, words = self.inference(no_pos_lemma,sent_tag=sent_tag,checkpointfile=checkpointfile) # for xml counter = 0 @@ -1310,20 +1319,20 @@ def get_bound_group_map(data): lines = reorder(lines) # Split out the internal tags within MWT tokens, as these too get a POS tag - lines = lines.split("\n") + data = lines.split("\n") retokenized = [] - for line in lines: + for line in data: if line == "|": retokenized.append(line) else: retokenized.append("\n".join(line.split("|"))) - lines = "\n".join(retokenized) + data = "\n".join(retokenized) """ Now add the pos tags """ - bound_group_map = get_bound_group_map(lines) if out_mode == "conllu" else None - data = conllize(lines, element="s", super_mapping=bound_group_map, attrs_as_comments=True) + bound_group_map = get_bound_group_map(data) if out_mode == "conllu" else None + data = conllize(data, element="s", super_mapping=bound_group_map, attrs_as_comments=True) data = data.strip() + "\n" # Ensure final new line for last sentence # add the pos tags to conllized file and remove the rows hyphenated MWT ID @@ -1418,7 +1427,6 @@ def main(): # testing only datatype=args.datatype) tagger.prepare_data_files() - # tagger.train(checkpointfile='/home/nitin/Desktop/hebpipe/HebPipe/hebpipe/data/checkpoint/htb_best_sent_pos_model_13.316283_0.979424_0.98009.pt') tagger.train() From 65040c12b517b0391b60297cb695497f240dbd80 Mon Sep 17 00:00:00 2001 From: nitin Date: Thu, 8 Sep 2022 13:02:50 +0800 Subject: [PATCH 25/32] fix formatting indentation --- hebpipe/heb_pipe.py | 180 ++++++++++++++++++++++---------------------- 1 file changed, 90 insertions(+), 90 deletions(-) diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index 6b84f9a..4797a88 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -582,116 +582,116 @@ def download_requirements(models_ok=True): def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, do_parse=True, do_entity=True, out_mode="conllu", sent_tag=None, preloaded=None, from_pipes=False,cpu=False): - data = input_data.replace("\t","") - data = data.replace("\r","") + data = input_data.replace("\t","") + data = data.replace("\r","") - if from_pipes: - input_data = input_data.replace("|","") + if from_pipes: + input_data = 
input_data.replace("|","") - if preloaded is not None: - rf_tok, xrenner, mtltagger,parser, lemmatizer = preloaded - else: - rf_tok = RFTokenizer(model=model_dir + "heb.sm" + str(sys.version_info[0])) - xrenner = Xrenner(model=model_dir + "heb.xrm") - parser = None if not do_parse else Parser.load(model_dir + "heb.diaparser",verbose=False) - lemmatizer = None if not do_lemma and not do_tag else init_lemmatizer() - mtltagger = Tagger(trainflag=False, bestmodelpath=model_dir, sequencelength=256,cpu=cpu) + if preloaded is not None: + rf_tok, xrenner, mtltagger,parser, lemmatizer = preloaded + else: + rf_tok = RFTokenizer(model=model_dir + "heb.sm" + str(sys.version_info[0])) + xrenner = Xrenner(model=model_dir + "heb.xrm") + parser = None if not do_parse else Parser.load(model_dir + "heb.diaparser",verbose=False) + lemmatizer = None if not do_lemma and not do_tag else init_lemmatizer() + mtltagger = Tagger(trainflag=False, bestmodelpath=model_dir, sequencelength=256,cpu=cpu) - if do_whitespace: - data = whitespace_tokenize(data, abbr=data_dir + "heb_abbr.tab",add_sents=sent_tag=="auto", from_pipes=from_pipes) + if do_whitespace: + data = whitespace_tokenize(data, abbr=data_dir + "heb_abbr.tab",add_sents=sent_tag=="auto", from_pipes=from_pipes) - if from_pipes: - tokenized = data + if from_pipes: + tokenized = data + else: + if do_tok: + tokenized = rf_tok.rf_tokenize(data.strip().split("\n")) + tokenized = "\n".join(tokenized) else: - if do_tok: - tokenized = rf_tok.rf_tokenize(data.strip().split("\n")) - tokenized = "\n".join(tokenized) - else: - # Assume data is already one token per line - tokenized = data + # Assume data is already one token per line + tokenized = data - bound_group_map = get_bound_group_map(tokenized) if out_mode == "conllu" else None + bound_group_map = get_bound_group_map(tokenized) if out_mode == "conllu" else None - if mtltagger: - tagged_conllu, tokenized, morphs, words = mtltagger.predict(tokenized,sent_tag=sent_tag,checkpointfile=model_dir + 'heb.sbdposmorph.pt') + if mtltagger: + tagged_conllu, tokenized, morphs, words = mtltagger.predict(tokenized,sent_tag=sent_tag,checkpointfile=model_dir + 'heb.sbdposmorph.pt') - if out_mode == "pipes": - return tokenized - else: - tokenized = tokenized.split("\n") - retokenized = [] - for line in tokenized: - if line == '|': - retokenized.append(line) - else: - retokenized.append("\n".join(line.split("|"))) - tokenized = "\n".join(retokenized) + if out_mode == "pipes": + return tokenized + else: + tokenized = tokenized.split("\n") + retokenized = [] + for line in tokenized: + if line == '|': + retokenized.append(line) + else: + retokenized.append("\n".join(line.split("|"))) + tokenized = "\n".join(retokenized) - if sent_tag == 'auto': sent_tag = 's' + if sent_tag == 'auto': sent_tag = 's' - del mtltagger - del rf_tok - torch.cuda.empty_cache() + del mtltagger + del rf_tok + torch.cuda.empty_cache() - if do_tag: - zeros = ["0" for i in range(len(morphs))] - zero_conllu = inject_col(zeros, tagged_conllu, into_col=6, skip_supertoks=True) - lemmas = lemmatize(lemmatizer, zero_conllu, morphs) - tagged = inject_col(tagged_conllu, tokenized, 4) + if do_tag: + zeros = ["0" for i in range(len(morphs))] + zero_conllu = inject_col(zeros, tagged_conllu, into_col=6, skip_supertoks=True) + lemmas = lemmatize(lemmatizer, zero_conllu, morphs) + tagged = inject_col(tagged_conllu, tokenized, 4) - if do_lemma: - lemmatized = inject_col(lemmas, tagged, -1) - else: - lemmatized = tagged + if do_lemma: + lemmatized = inject_col(lemmas, tagged, -1) 
+ else: + lemmatized = tagged - morphs = postprocess_morph(morphs, words, lemmas) - morphed = inject_col(morphs, lemmatized, -1) + morphs = postprocess_morph(morphs, words, lemmas) + morphed = inject_col(morphs, lemmatized, -1) - if not do_parse: - if out_mode == "conllu": - conllized = conllize(morphed, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, - attrs_as_comments=True) - conllized = add_space_after(input_data,conllized) - return conllized - else: - if not PY3: - morphed = morphed.decode("utf8") - return morphed - - else: + if not do_parse: if out_mode == "conllu": - conllized = conllize(tokenized, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, - attrs_as_comments=True, ten_cols=True) + conllized = conllize(morphed, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, + attrs_as_comments=True) conllized = add_space_after(input_data,conllized) return conllized else: - return tokenized - - if do_parse: - conllized = conllize(morphed,tag="PUNCT",element=sent_tag,no_zero=True,super_mapping=bound_group_map,attrs_as_comments=True,ten_cols=True) - parsed = diaparse(parser, conllized) - parsed = morph_deped.run_depedit(parsed) - - if do_entity: - xrenner.docname = "_" - ents = xrenner.analyze(parsed,"conll_sent") - ents = get_col(ents, -1) - entified = inject_col(ents, parsed, col=-1, into_col=9, skip_supertoks=True) - entified = add_space_after(input_data,entified) - if PY3: - return entified - else: - return entified.decode("utf8") - else: - parsed = add_space_after(input_data,parsed) - return parsed + if not PY3: + morphed = morphed.decode("utf8") + return morphed + + else: + if out_mode == "conllu": + conllized = conllize(tokenized, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, + attrs_as_comments=True, ten_cols=True) + conllized = add_space_after(input_data,conllized) + return conllized else: - if out_mode == "conllu": - conllized = conllize(tagged,tag="PUNCT",element=sent_tag,no_zero=True,super_mapping=bound_group_map,attrs_as_comments=True) - conllized = add_space_after(input_data,conllized) - return conllized + return tokenized + + if do_parse: + conllized = conllize(morphed,tag="PUNCT",element=sent_tag,no_zero=True,super_mapping=bound_group_map,attrs_as_comments=True,ten_cols=True) + parsed = diaparse(parser, conllized) + parsed = morph_deped.run_depedit(parsed) + + if do_entity: + xrenner.docname = "_" + ents = xrenner.analyze(parsed,"conll_sent") + ents = get_col(ents, -1) + entified = inject_col(ents, parsed, col=-1, into_col=9, skip_supertoks=True) + entified = add_space_after(input_data,entified) + if PY3: + return entified else: - return tagged + return entified.decode("utf8") + else: + parsed = add_space_after(input_data,parsed) + return parsed + else: + if out_mode == "conllu": + conllized = conllize(tagged,tag="PUNCT",element=sent_tag,no_zero=True,super_mapping=bound_group_map,attrs_as_comments=True) + conllized = add_space_after(input_data,conllized) + return conllized + else: + return tagged def run_hebpipe(): From 50c0718527a1b240157501b36ac144a7d1be241a Mon Sep 17 00:00:00 2001 From: nitin Date: Thu, 8 Sep 2022 13:11:05 +0800 Subject: [PATCH 26/32] bugfix --- hebpipe/heb_pipe.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index 4797a88..e00b257 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -661,7 +661,7 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, 
do_lemma=True, else: if out_mode == "conllu": conllized = conllize(tokenized, tag="PUNCT", element=sent_tag, no_zero=True, super_mapping=bound_group_map, - attrs_as_comments=True, ten_cols=True) + attrs_as_comments=True) conllized = add_space_after(input_data,conllized) return conllized else: @@ -850,8 +850,6 @@ def run_hebpipe(): if __name__ == "__main__": import logging - from time import time logging.disable(logging.INFO) - startpipeline = time() run_hebpipe() From 6b32af1d80dd0ad1f0140d6e6aee5594324fc4eb Mon Sep 17 00:00:00 2001 From: amir-zeldes Date: Fri, 30 Sep 2022 13:12:48 -0400 Subject: [PATCH 27/32] add binyan lookup for any VERB without binyan prediction --- hebpipe/heb_pipe.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index 8d2155a..a5f637b 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -521,7 +521,7 @@ def diaparse(parser, conllu): return merged -def postprocess_morph(feats, words, lemmas): +def postprocess_morph(feats, words, lemmas, tags): def add_feat(morph, feat): if morph == "_": return feat @@ -535,7 +535,8 @@ def add_feat(morph, feat): for i, lemma in enumerate(lemmas): word = words[i] feat = feats[i] - if "HebBinyan" in feat: # Rely on BERT to notice that binyan is needed + tag = tags[i] + if "HebBinyan" in feat or tag == "VERB": # Rely on BERT to notice that binyan is needed, or if it's a VERB if (word,lemma) in binyan_lookup: feat = add_feat(feat,"HebBinyan=" + binyan_lookup[(word,lemma)]) elif lemma in binyan_lemma_lookup: @@ -562,7 +563,7 @@ def check_requirements(): def download_requirements(models_ok=True): urls = [] if not models_ok: - models_base = "http://gucorpling.org/amir/download/heb_models_v2/" + models_base = "http://gucorpling.org/amir/download/heb_models_v3/" urls.append(models_base + "heb.sm" + str(sys.version_info[0])) urls.append(models_base + "heb.diaparser") urls.append(models_base + "heb.xrm") @@ -644,7 +645,8 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, else: lemmatized = tagged - morphs = postprocess_morph(morphs, words, lemmas) + tags = [l.split("\t")[3] for l in tagged_conllu.split("\n") if "\t" in l] + morphs = postprocess_morph(morphs, words, lemmas, tags) morphed = inject_col(morphs, lemmatized, -1) if not do_parse: @@ -693,6 +695,7 @@ def nlp(input_data, do_whitespace=True, do_tok=True, do_tag=True, do_lemma=True, else: return tagged + def run_hebpipe(): if sys.version_info[0] == 2 and sys.version_info[1] < 7: @@ -718,11 +721,11 @@ def run_hebpipe(): Just tokenize a file using pipes: > python heb_pipe.py -wt -o pipes example_in.txt -Pos tag, lemmatize, add morphology and parse a pre-tokenized file, splitting sentences by existing tags: -> python heb_pipe.py -plmd -s sent example_in.txt +POS tag, lemmatize, add morphology and parse a pre-tokenized file, splitting sentences by existing tags: +> python heb_pipe.py -pld -s sent example_in.txt Add full analyses to a whole directory of *.txt files, output to a specified directory: -> python heb_pipe.py -wtplmdec --dirout /home/heb/out/ *.txt +> python heb_pipe.py -wtpldec --dirout /home/heb/out/ *.txt Parse a tagged TT SGML file into CoNLL tabular format for treebanking, use existing tag to recognize sentence borders: > python heb_pipe.py -d -s sent example_in.tt @@ -732,7 +735,7 @@ def run_hebpipe(): g1 = parser.add_argument_group("standard module options") g1.add_argument("-w","--whitespace", action="store_true", help='Perform white-space based 
tokenization of large word forms') g1.add_argument("-t","--tokenize", action="store_true", help='Tokenize large word forms into smaller morphological segments') - g1.add_argument("-g","--posmorph", action="store_true", help='Do POS and Morph tagging') + g1.add_argument("-p","--posmorph", action="store_true", help='Do POS and Morph tagging') g1.add_argument("-l","--lemma", action="store_true", help='Do lemmatization') g1.add_argument("-d","--dependencies", action="store_true", help='Parse with dependency parser') g1.add_argument("-e","--entities", action="store_true", help='Add entity spans and types') @@ -775,7 +778,7 @@ def run_hebpipe(): if not opts.quiet: log_tasks(opts) - # Check if models, Marmot and Malt Parser are available + # Check if models are available if opts.posmorph or opts.lemma or opts.dependencies or opts.tokenize or opts.entities: models_OK = check_requirements() if not models_OK: @@ -840,7 +843,7 @@ def run_hebpipe(): f.write((processed.strip() + "\n")) else: # Single file, print to stdout if PY3: - sys.stdout.buffer.write(processed.encode("utf8")) + sys.stdout.buffer.write((processed+"\n\n").encode("utf8")) else: print(processed.encode("utf8")) From 3b95f07d8dbf92f615ac7cf67c42437790591b20 Mon Sep 17 00:00:00 2001 From: amir-zeldes Date: Fri, 30 Sep 2022 13:13:20 -0400 Subject: [PATCH 28/32] Rename posmorph short option to -p --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ab1040b..c78213e 100644 --- a/README.md +++ b/README.md @@ -97,7 +97,7 @@ standard module options: forms -t, --tokenize Tokenize large word forms into smaller morphological segments - -g, --posmorph Do POS tagging and Morphological Tagging + -p, --posmorph Do POS tagging and Morphological Tagging -l, --lemma Do lemmatization -d, --dependencies Parse with dependency parser -e, --entities Add entity spans and types @@ -122,7 +122,7 @@ less common options: Whitespace tokenize, tokenize morphemes, add pos, lemma, morph, dep parse with automatic sentence splitting, entity recognition and coref for one text file, output in default conllu format: -> python heb_pipe.py -wtgldec example_in.txt +> python heb_pipe.py -wtpldec example_in.txt OR specify no processing options (automatically assumes you want all steps) > python heb_pipe.py example_in.txt @@ -131,10 +131,10 @@ Just tokenize a file using pipes: > python heb_pipe.py -wt -o pipes example_in.txt Pos tag, lemmatize, add morphology and parse a pre-tokenized file, splitting sentences by existing tags: -> python heb_pipe.py -gld -s sent example_in.txt +> python heb_pipe.py -pld -s sent example_in.txt Add full analyses to a whole directory of *.txt files, output to a specified directory: -> python heb_pipe.py -wtgldec --dirout /home/heb/out/ *.txt +> python heb_pipe.py -wtpldec --dirout /home/heb/out/ *.txt Parse a tagged TT SGML file into CoNLL tabular format for treebanking, use existing tag to recognize sentence borders: > python heb_pipe.py -d -s sent example_in.tt From 6bfd5e7b2dacc0791aeebd0244aa85e02c4dfa30 Mon Sep 17 00:00:00 2001 From: nitin Date: Fri, 7 Oct 2022 04:35:23 +0800 Subject: [PATCH 29/32] fixes for keeping \n in text --- .gitignore | 30 ++++++++++++++++++++++++++++++ hebpipe/heb_pipe.py | 2 +- hebpipe/lib/mtlmodel.py | 7 ++----- 3 files changed, 33 insertions(+), 6 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..18c6aa7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,30 @@ +/hebpipe/conll18_ud_eval.py 
+/hebpipe/he_htb-ud-dev.conllu +/hebpipe/he_htb-ud-test.conllu +/hebpipe/he_htb-ud-train.conllu +/hebpipe/he_iahltwiki-ud-dev.conllu +/hebpipe/he_iahltwiki-ud-test.conllu +/hebpipe/he_iahltwiki-ud-train.conllu +/hebpipe/htb-test.conllu +/hebpipe/htb-test.txt +/hebpipe/htb-test-split.conllu +/hebpipe/htb-test-split.txt +/hebpipe/iahlt-test.conllu +/hebpipe/iahlt-test.txt +/hebpipe/models/stanza/he_htb.pretrain.pt +/hebpipe/models/stanza/he_htb_tagger.pt +/hebpipe/models/stanza/he_lemmatizer.pt +/hebpipe/models/heb.diaparser +/hebpipe/models/heb.flair +/hebpipe/models/heb.morph +/hebpipe/models/heb.sbdposmorph.pt +/hebpipe/models/heb.seg +/hebpipe/models/heb.sent +/hebpipe/models/heb.sm3 +/hebpipe/models/heb.xrm +/hebpipe/data/checkpoint/ +/hebpipe/data/tensorboarddir/ + +__pycache__ +/hebpipe/data/sentsplit_postag_dev_gold.tab +/hebpipe/data/sentsplit_postag_train_gold.tab diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index a5f637b..7736e21 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -827,7 +827,7 @@ def run_hebpipe(): sys.stderr.write("Processing " + base + "\n") try: - input_text = io.open(infile,encoding="utf8").read().replace("\r","").replace('\n'," ") + input_text = io.open(infile,encoding="utf8").read().replace("\r","") except UnicodeDecodeError: # Fallback to support Windows Hebrew encoding input_text = io.open(infile,encoding="cp1255").read().replace("\r","") diff --git a/hebpipe/lib/mtlmodel.py b/hebpipe/lib/mtlmodel.py index 6c53083..3997f3f 100644 --- a/hebpipe/lib/mtlmodel.py +++ b/hebpipe/lib/mtlmodel.py @@ -398,11 +398,8 @@ def forward(self,data,mode='train'): # Make embeddings and scalar average them across subwords, vertically. sentences = [d.split() for d in sentences] # for AlephBERT tokens = self.tokenizer(sentences,return_tensors='pt',padding=True,is_split_into_words=True,truncation=True).to(self.device) # tell AlephBERT that there is some tokenization already. - try: - output = self.model(**tokens) - except Exception: - print ('here') - raise + + output = self.model(**tokens) hiddenstates = output[2][-self.lastn:] scalarsum = hiddenstates[0] for i in range(1,self.lastn): From 96a48d84c34817aeeab335741dae1f8e61842a2c Mon Sep 17 00:00:00 2001 From: amir-zeldes Date: Mon, 17 Oct 2022 16:05:39 -0400 Subject: [PATCH 30/32] README --- README.md | 51 ++++++++++++++++++----------------- hebpipe/heb_pipe.py | 2 +- hebpipe/lib/partial_morph.ini | 5 +++- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index c78213e..fad8d41 100644 --- a/README.md +++ b/README.md @@ -15,22 +15,18 @@ A simple NLP pipeline for Hebrew text in UTF-8 encoding, using standard componen Note that entity recognition and coreference are still in beta and offer rudimentary accuracy. -Online demo available at: (choose 'Hebrew' and enter plain text) +To cite this tool in academic papers please refer to this paper: -https://corpling.uis.georgetown.edu/xrenner/ +Zeldes, Amir, Nick Howell, Noam Ordan and Yifat Ben Moshe (2022) [A Second Wave of UD Hebrew Treebanking and Cross-Domain Parsing](https://arxiv.org/abs/2210.07873). In: *Proceedings of EMNLP 2022*. Abu Dhabi, UAE. -To cite this work please refer to the paper about the morphological segmenter here: - -Zeldes, Amir (2018) A Characterwise Windowed Approach to Hebrew Morphological Segmentation. In: *Proceedings of the 15th SIGMORPHON Workshop on Computational Research in Phonetics, Phonology, and Morphology*. Brussels, Belgium. 
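As an aside on the PATCH 29 hunk above that drops the try/except around the encoder call in hebpipe/lib/mtlmodel.py: a minimal, self-contained sketch of the same embedding pattern (pre-split sentences passed with is_split_into_words=True, then combining the last n hidden layers) might look like the following. The checkpoint id, the lastn value, and the final averaging step are illustrative assumptions, not the project's exact code.

```
# Illustrative sketch only (not the project's Tagger class): embed a batch of
# pre-split sentences and combine the last n hidden layers, mirroring the
# pattern in the mtlmodel.py hunk of PATCH 29 above. The checkpoint name and
# lastn value are placeholder assumptions.
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("onlplab/alephbert-base")
model = AutoModel.from_pretrained("onlplab/alephbert-base")

sentences = [d.split() for d in ["שלום עולם", "עוד משפט קצר"]]  # pre-tokenized input
tokens = tokenizer(sentences, return_tensors="pt", padding=True,
                   is_split_into_words=True, truncation=True)

lastn = 4
with torch.no_grad():
    output = model(**tokens, output_hidden_states=True, return_dict=True)

hiddenstates = output.hidden_states[-lastn:]   # the patched code reads these positionally as output[2]
scalarsum = hiddenstates[0]
for i in range(1, lastn):                      # element-wise sum of the last n layers
    scalarsum = scalarsum + hiddenstates[i]
embeddings = scalarsum / lastn                 # simple average here; shape (batch, seq_len, hidden)
```

The only behavioral point the hunk itself makes is that the bare model call replaces the old try/except wrapper; everything else in this sketch is context reconstructed from the surrounding lines.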
``` -@InProceedings{Zeldes2018, - author = {Amir Zeldes}, - title = {A CharacterwiseWindowed Approach to {H}ebrew Morphological Segmentation}, - booktitle = {Proceedings of the 15th {SIGMORPHON} Workshop on Computational Research in Phonetics, Phonology, and Morphology}, - year = {2018}, - pages = {101--110}, - address = {Brussels, Belgium} +@InProceedings{ZeldesHowellOrdanBenMoshe2022, + author = {Amir Zeldes and Nick Howell and Noam Ordan and Yifat Ben Moshe}, + booktitle = {Proceedings of {EMNLP} 2022}, + title = {A SecondWave of UD Hebrew Treebanking and Cross-Domain Parsing}, + year = {2022}, + address = {Abu Dhabi, UAE}, } ``` @@ -57,18 +53,23 @@ Models can be downloaded automatically by the script on its first run. The NLP pipeline will run on Python 2.7+ or Python 3.5+ (2.6 and lower are not supported). Required libraries: - * requests - * numpy - * scipy - * pandas - * depedit - * xmltodict - * xgboost==0.81 - * rftokenizer - * joblib - * flair==0.6.1 - * stanza - * diaparser +``` +requests +transformers==3.5.1 +torch==1.6.0 +xgboost==0.81 +rftokenizer +numpy +scipy +depedit +pandas +joblib +xmltodict +diaparser==1.1.2 +flair==0.6.1 +stanza +conllu +``` You should be able to install these manually via pip if necessary (i.e. `pip install rftokenizer` etc.). @@ -79,7 +80,7 @@ Note that some older versions of Python + Windows do not install numpy correctly ### Model files -Model files are too large to include in the standard GitHub repository. The software will offer to download them automatically. The latest models can also be downloaded manually at https://corpling.uis.georgetown.edu/amir/download/heb_models_v2/. +Model files are too large to include in the standard GitHub repository. The software will offer to download them automatically. The latest models can also be downloaded manually at https://gucorpling.org/amir/download/heb_models_v3/. 
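Since the README paragraph above only gives the base URL for manual downloads, a minimal sketch of fetching one model file with the requests dependency from the list above may be useful; the file name and target path are assumptions based on download_requirements() and the .gitignore entries earlier in this series, and the pipeline's built-in downloader remains the normal route.

```
# Hypothetical manual download of a single model file; the pipeline normally
# fetches these itself (see download_requirements() in heb_pipe.py). Which
# files are hosted at this URL is an assumption based on names used elsewhere
# in this patch series.
import os
import requests

MODELS_BASE = "https://gucorpling.org/amir/download/heb_models_v3/"
fname = "heb.diaparser"   # e.g. the DiaParser model loaded via Parser.load()

os.makedirs(os.path.join("hebpipe", "models"), exist_ok=True)
resp = requests.get(MODELS_BASE + fname, stream=True, timeout=120)
resp.raise_for_status()
with open(os.path.join("hebpipe", "models", fname), "wb") as out:
    for chunk in resp.iter_content(chunk_size=1 << 20):   # stream in 1 MB chunks
        out.write(chunk)
```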
## Command line usage diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index 7736e21..ac58309 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -713,7 +713,7 @@ def run_hebpipe(): -------------- Whitespace tokenize, tokenize morphemes, add pos, lemma, morph, dep parse with automatic sentence splitting, entity recognition and coref for one text file, output in default conllu format: -> python heb_pipe.py -wtplmdec example_in.txt +> python heb_pipe.py -wtpldec example_in.txt OR specify no processing options (automatically assumes you want all steps) > python heb_pipe.py example_in.txt diff --git a/hebpipe/lib/partial_morph.ini b/hebpipe/lib/partial_morph.ini index 04a9369..3578710 100644 --- a/hebpipe/lib/partial_morph.ini +++ b/hebpipe/lib/partial_morph.ini @@ -119,4 +119,7 @@ pos=/NUM/&text=/^[0-9.,]+$/ none #1:morph-=Gender # Prevent blank morph and trailing pipe morph=/(.*)\|$/ none #1:morph=$1 morph=/^\|(.*)/ none #1:morph=$1 -morph=/^$/ none #1:morph=_ \ No newline at end of file +morph=/^$/ none #1:morph=_ + +# HebBinyan is passive but voice feature missing +pos=/VERB/&morph=/.*(HUFAL|PUAL).*/&morph!=/.*Voice.*/ none #1:morph+=Voice=Pass \ No newline at end of file From e535325639fa973a45a0f9999ebfe48c4c5cc085 Mon Sep 17 00:00:00 2001 From: amir-zeldes Date: Tue, 18 Oct 2022 10:04:19 -0400 Subject: [PATCH 31/32] remove flair modules --- hebpipe/heb_pipe.py | 6 - hebpipe/lib/flair/build_sent_data.py | 57 - hebpipe/lib/flair/data/sent_dev.txt | 11583 ------------------------- hebpipe/lib/flair_pos_tagger.py | 420 - hebpipe/lib/flair_sent_splitter.py | 441 - 5 files changed, 12507 deletions(-) delete mode 100644 hebpipe/lib/flair/build_sent_data.py delete mode 100644 hebpipe/lib/flair/data/sent_dev.txt delete mode 100644 hebpipe/lib/flair_pos_tagger.py delete mode 100644 hebpipe/lib/flair_sent_splitter.py diff --git a/hebpipe/heb_pipe.py b/hebpipe/heb_pipe.py index ac58309..544df85 100644 --- a/hebpipe/heb_pipe.py +++ b/hebpipe/heb_pipe.py @@ -20,8 +20,6 @@ from .lib.append_column import inject_col from .lib.sent_split import toks_to_sents from .lib.whitespace_tokenize import add_space_after, tokenize as whitespace_tokenize - from .lib.flair_sent_splitter import FlairSentSplitter - from .lib.flair_pos_tagger import FlairTagger from .lib.mtlmodel import Tagger except ImportError: # direct script usage from lib.xrenner import Xrenner @@ -30,8 +28,6 @@ from lib.append_column import inject_col from lib.sent_split import toks_to_sents from lib.whitespace_tokenize import add_space_after, tokenize as whitespace_tokenize - from lib.flair_sent_splitter import FlairSentSplitter - from lib.flair_pos_tagger import FlairTagger from lib.mtlmodel import Tagger PY3 = sys.version_info[0] > 2 @@ -761,8 +757,6 @@ def run_hebpipe(): opts = diagnose_opts(opts) if opts.cpu: - import flair - flair.device = torch.device('cpu') torch.cuda.is_available = lambda: False dotok = opts.tokenize diff --git a/hebpipe/lib/flair/build_sent_data.py b/hebpipe/lib/flair/build_sent_data.py deleted file mode 100644 index a5cb3cb..0000000 --- a/hebpipe/lib/flair/build_sent_data.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -build_sent_data.py - -Creates train/dev/test data in flair format to train flair_sent_splitter - -Assumes three files in the current directory, e.g. 
dev.conllu, test.conllu, train.conllu -Files for flair training will be created in the current directory and should be *MOVED* -to lib/flair/data/ before invoking training in flair_sent_splitter.py -""" - -from glob import glob -import os, sys, re, io -from collections import defaultdict - -conll_dir = "" + os.sep # CoNLL-U training data is assumed to be in lib/flair/*.conllu, or set a different path here - -files = glob(conll_dir + "*.conllu") - -data = defaultdict(list) - -for file_ in files: - doc = os.path.basename(file_).replace(".conllu", "") - - partition = "train" - if "test" in file_: - partition = "test" - elif "dev" in file_: - partition = "dev" - - lines = io.open(file_, encoding="utf8").read().strip().split("\n") - - sent = "B-SENT" - counter = 0 - for line in lines: - if len(line.strip()) == 0: - sent = "B-SENT" - if "\t" in line: - fields = line.split("\t") - if "-" in fields[0]: - continue - word = fields[1] - pos = fields[4] - data[partition].append(word + " " + sent) - sent = "O" - counter += 1 - if counter == 21: - #data[partition].append("") - #data[partition].append("-DOCSTART- X") - data[partition].append("") - counter = 0 - - data[partition].append("") - -for partition in data: - lines = data[partition] - with io.open("sent_" + partition + ".txt", "w", encoding="utf8", newline="\n") as f: - f.write("\n".join(lines) + "\n") diff --git a/hebpipe/lib/flair/data/sent_dev.txt b/hebpipe/lib/flair/data/sent_dev.txt deleted file mode 100644 index b97302b..0000000 --- a/hebpipe/lib/flair/data/sent_dev.txt +++ /dev/null @@ -1,11583 +0,0 @@ -עשרות B-SENT -אנשים O -מגיעים O -מ O -תאילנד O -ל O -ישראל O -כש O -הם O -נרשמים O -כ O -מתנדבים O -, O -אך O -למעשה O -משמשים O -עובדים O -שכירים O -זולים O -. O -תופעה B-SENT - -זו O -התבררה O -אתמול O -ב O -וועדת O -ה O -עבודה O -ו O -ה O -רווחה O -של O -ה O -כנסת O -, O -ש O -דנה O -ב O -נושא O -העסקת O -עובדים O -זרים O - -. O -יו"ר B-SENT -ה O -וועדה O -, O -ח"כ O -אורה O -נמיר O -( O -מערך O -) O -, O -טענה O -כי O -" O -מביאים O -עובדים O -זרים O -ל O -ישראל O -על O - -תקן O -של O -מתנדבים O -מ O -תאילנד O -, O -רק O -כדי O -לא O -לשלם O -ל O -הם O -שכר O -מינימום O -. O -מ B-SENT -צד O -אחד O -רוצה O -ה O -אוצר O - -להוריד O -את O -שכר O -ה O -מינימום O -, O -ו O -מ O -צד O -שני O -מתיר O -משרד O -ה O -עבודה O -ו O -ה O -רווחה O -להעסיק O -עובדים O -זרים O -ב O - -פחות O -מ O -שכר O -זה O -" O -. O -נמיר B-SENT -הודיעה O -כי O -תפנה O -ל O -שרי O -ה O -פנים O -ו O -ה O -עבודה O -ו O -ה O -רווחה O -ו O - -ל O -מזכיר O -תנועת O -ה O -מושבים O -, O -ב O -תביעה O -לבטל O -את O -הזמנת O -ם O -של O -500 O -עובדים O -זרים O -מ O -תאילנד O -כ O -מתנדבים O -כביכול O - -. O -היא B-SENT -הודיעה O -כי O -ה O -וועדה O -תגבש O -הצעת O -חוק O -ב O -נושא O -ה O -עובדים O -ה O -זרים O -, O -ש O -תכלול O -איסור O -על O -מתן O - -שכר O -ל O -עובדים O -מתחת O -ל O -שכר O -ה O -מינימום O -ו O -מתן O -ה O -תנאים O -ה O -סוציאליים O -ה O -מקובלים O -ב O -מקום O -ה O -עבודה O -. O - -כמו B-SENT -כן O -, O -תציב O -הצעת O -ה O -חוק O -עונשי O -מאסר O -ו O -הטלת O -קנסות O -כבדים O -ל O -מי O -ש O -יעסיק O -עובדים O -זרים O -בלא O -רשיון O - -. O -מרגלית B-SENT -אילת O -, O -ה O -ממונה O -על O -מתן O -היתרי O -עבודה O -ל O -זרים O -ב O -שירות O -ה O -תעסוקה O -, O -מסרה O -כי O -תנועת O -ה O - -מושבים O -הפעילה O -לחץ O -ש O -יותר O -ל O -ה O -להביא O -עובדים O -זרים O -מ O -תאילנד O -. O -היא B-SENT -אמרה O -כי O -שירות O -ה O -תעסוקה O -הציע O -להביא O - -עובדים O -מ O -דרום O -לבנון O -, O -אך O -תנועת O -ה O -מושבים O -סירבה O -. 
O -ישראל B-SENT -ארד O -, O -סמנכ"ל O -ה O -ביטוח O -ה O -לאומי O -, O -אמר O - -כי O -ממלא O -מקום O -שר O -ה O -עבודה O -ו O -ה O -רווחה O -, O -דוד O -מגן O -, O -הקים O -ועדה O -בין O -- O -משרדית O -, O -ש O -המליצה O - -להגדיל O -ב O -אופן O -משמעותי O -את O -ה O -קנסות O -ל O -מעסיקים O -. O -ח"כ B-SENT -אלי O -דיין O -( O -מערך O -) O -הגיש O -הצעת O -חוק O -ש O -לפי O - -ה O -יוטל O -היטל O -על O -מעסיקי O -עובדים O -זרים O -, O -כדי O -למנוע O -העדפת O -ם O -על O -עובדים O -ישראליים O -. O -חברות B-SENT -ה O -מעסיקות O -עובדים O -זרים O - -זוכות O -ב O -מכרזים O -, O -היות O -ו O -הן O -מציעות O -שירותים O -זולים O -יותר O -. O -ח"כ B-SENT -רן O -כהן O -( O -רץ O -) O -אמר O -כי O -על O - -ה O -וועדה O -לפנות O -ל O -ממשלה O -ב O -דרישה O -לחסל O -את O -העסקת O -ה O -עובדים O -ה O -זרים O -לאלתר O -, O -על O -רקע O -היצע O -ה O -עולים O - -ה O -מוכנים O -לעבוד O -ב O -כל O -עבודה O -ב O -שכר O -ה O -מינימום O -. O -ל B-SENT -דברי O -ו O -, O -יש O -לפנות O -ל O -משרד O -ה O -עבודה O - -ו O -ה O -רווחה O -ב O -דרישה O -לבטל O -בתוך O -חודש O -את O -עבודת O -ה O -עובדים O -ה O -זרים O -ה O -מועסקים O -כיום O -תחת O -ה O -כותרת O -" O - -מתנדבים O -" O -. O -ח"כ B-SENT -יאיר O -צבן O -( O -מפם O -) O -אמר O -כי O -פרשת O -ה O -מתנדבים O -ה O -תאילנדיים O -היא O -" O -כתם O -חרפה O -על O - -פרצופ O -נו O -ה O -לאומי O -. O -ה B-SENT -מוח O -מתפלץ O -לא O -רק O -מ O -ה O -תופעה O -ה O -מבישה O -אלא O -גם O -מ O -דרכי O -ה O -הערמה O - -. O -ה B-SENT -ראש O -ה O -יהודי O -ממציא O -ל O -נו O -פטנטים O -ב O -דמות O -מתנדבים O -, O -ה O -משולמים O -שכר O -מחפיר O -" O -. O -הוא B-SENT -קרא O - -ל O -הפסקת O -ה O -תופעה O -ו O -ל O -מתן O -תשלומים O -רטרואקטיוויים O -ל O -תאילנדים O -, O -ש O -ישלימו O -את O -שכר O -ה O -מינימום O -. O -דוברת B-SENT -שירות O - -ה O -תעסוקה O -מסרה O -אתמול O -ב O -תגובה O -, O -כי O -ה O -שירות O -פועל O -ל O -צמצום O -מספר O -ם O -של O -ה O -עובדים O -ה O -זרים O -ו O - -ל O -הכנסת O -עולים O -חדשים O -ב O -מקומ O -ם O -. O -" B-SENT -לאחר O -פעילות O -נמרצת O -ב O -שבועות O -ה O -אחרונים O -, O -צומצם O -מספר O -ה O -עובדים O - -ה O -זרים O -ב O -ענף O -ה O -אחיות O -ב O -50 O -% O -. O -עד B-SENT -סוף O -ה O -שנה O -לא O -תועסק O -ב O -ארץ O -אף O -אחות O -כ O - -עובדת O -זרה O -" O -. O -ה B-SENT -דוברת O -אמרה O -כי O -ה O -שירות O -קורא O -ל O -חקלאים O -לדאוג O -ל O -עובדים O -ישראליים O -, O -במקום O -ל O -תאילנדים O - -. O -" B-SENT -ה O -שירות O -לא O -שינה O -את O -ה O -מדיניות O -ב O -נושא O -העסקת O -תאילנדים O -ב O -חקלאות O -, O -אלא O -להיפך O -, O -הוא O -הבהיר O - -ל O -תנועת O -ה O -מושבים O -ש O -כל O -עובד O -ישראלי O -ש O -יסכים O -לעבוד O -ב O -ענף O -ה O -חקלאות O -, O -ישובץ O -ל O -עבודה O -לאלתר O -" O - -. O -מאמר B-SENT -ו O -של O -תום O -שגב O -, O -" O -ה O -קרב O -על O -סן O -סימון O -היה O -או O -לא O -היה O -" O -( O -" O -ה O - -ארץ O -" O -105 O -) O -, O -הגיע O -ל O -יד O -י O -רק O -ב O -ימים O -אלה O -. O -למרות B-SENT -ש O -חלפו O -מאז O -24 O -שנה O -, O - -ו O -למרות O -ש O -אירועי O -ימים O -אלה O -אולי O -מאפילים O -על O -כל O -דבר O -אחר O -, O -אני O -מרגיש O -חובה O -ב O -שמ O -ם O -של O -ה O - -חללים O -, O -ה O -פצועים O -ו O -לוחמי O -ה O -קרב O -ה O -זה O -, O -כ O -אחד O -ש O -היה O -שם O -, O -להתייחס O -ל O -מאמר O -. 
O - -קטעי B-SENT -ה O -מאמר O -ה O -עוסקים O -ב O -קרב O -עצמו O -, O -ו O -ש O -רק O -אלי O -אלי O -אתייחס O -, O -מאופיינים O -ב O -חוסר O -היגיון O -ו O - -ב O -סילוף O -עובדות O -, O -ו O -מעידים O -על O -ה O -כותב O -כי O -הוא O -משולל O -הבנה O -טקטית O -מינימלית O -ו O -ניסיון O -קרבי O -משמעותי O -. O -ו B-SENT - -כעת O -ל O -עובדות O -ה O -מצוטטות O -: O -כותרת O -ה O -מאמר O -, O -" O -ה O -קרב O -על O -סן O -סימון O -: O -היה O -לא O -היה O -" O - -, O -מרמזת O -כי O -ייתכן O -ש O -ב O -מנזר O -כלל O -לא O -התנהל O -קרב O -. O -ל B-SENT -כל O -אלה O -ה O -מתרשמים O -ו O -מסתפקים O -ב O -קריאת O - -ה O -כותרת O -בלבד O -, O -אני O -יכול O -להבטיח O -ש O -אכן O -התנהל O -שם O -קרב O -, O -ו O -הוא O -היה O -אחד O -ה O -קשים O -ו O -ה O - -מפוארים O -ב O -כל O -קרבות O -ישראל O -. O -ו B-SENT -על O -כך O -תעיד O -ה O -היסטוריה O -, O -ו O -יעידו O -עשרות O -ה O -לוחמים O -ש O -נשארו O -ב O - -חיים O -ו O -הצליחו O -לבצע O -את O -משימת O -ם O -על O -אף O -ה O -אבדות O -ה O -כבדות O -. O -ב B-SENT -מלחמת O -ה O -עצמאות O -, O -ב O -קרב O - -על O -ירושלים O -, O -נשארה O -שכונת O -קטמון O -תקועה O -ב O -לב O -ירושלים O -ה O -מערבית O -ו O -שימשה O -כ O -מרכז O -כוח O -ערבי O -רב O -ו O -מאורגן O - -. O -ה B-SENT -שטח O -ה O -חיוני O -של O -קטמון O -ל O -שליטה O -ב O -אזור O -מסוים O -או O -ל O -ניצחון O -ב O -קרב O -היה O -מנזר O -סן O -סימון O - -, O -ו O -לכן O -היה O -עלי O -עלי O -להשתלט O -עלי O -עלי O -. O -כתוב B-SENT -: O -" O -ב O -חוכמה O -עשה O -ל O -ך O -מלחמה O -" O -. O - -אכן B-SENT -, O -כך O -עשתה O -חטיבת O -" O -הראל O -" O -. O -היא B-SENT -כבשה O -תחילה O -את O -ה O -מנזר O -ו O -את O -התנגדות O -ה O -אויב O -שברה O - -לאחר O -מ O -כן O -, O -ב O -קרב O -מגננה O -ש O -התנהל O -בתוך O -ה O -מנזר O -( O -קרב O -מגננה O -מיועד O -ל O -שבירת O -כוחות O -חזקים O -ממ O - -ממ O -) O -. O -ה B-SENT -אויב O -אמנם O -נשבר O -; O -קרב O -ה O -מגננה O -הפך O -ל O -ניצחון O -מוחץ O -; O -קטמון O -נפלה O -ו O -ירושלים O -ה O - -מערבית O -אוחדה O -. O -ה B-SENT -מאמר O -, O -ה O -מצטט O -מ O -מחקר O -ו O -של O -אורי O -מילשטיין O -: O -" O -מלכתחילה O -לא O -היה O -צריך O -לכבוש O - -את O -ה O -מנזר O -... O -" O -, O -מצביע O -על O -חוסר O -הבנה O -אסטרטגית O -ו O -טקטית O -ב O -סוגיה O -זו O -. O -מורשת B-SENT -ה O -קרב O -ש O - -צה"ל O -אימץ O -כ O -תוצאה O -מ O -ה O -קרב O -על O -מנזר O -סן O -סימון O -איננה O -מיתוס O -. O -היא B-SENT -מבוססת O -על O -אותו O -קרב O -ה O -מגננה O - -ש O -ב O -ו O -אירעו O -מעשי O -גבורה O -עילאיים O -, O -ו O -על O -קבלת O -החלטות O -קשות O -ב O -תנאי O -קרב O -. O -חוסר B-SENT -ניסיון O -ו O -עלילה O - -זדונית O -באים O -ל O -ביטוי O -ב O -קטע O -ה O -מאמר O -ה O -דן O -ב O -נושא O -" O -רצח O -ה O -נזירות O -" O -. O -כל B-SENT -ישראלי O -ה O - -מנוסה O -ב O -קרב O -לילה O -ב O -שטח O -בנוי O -( O -כ O -מנזר O -סן O -סימון O -) O -יודע O -, O -כי O -תוך O -כדי O -פריצה O -ב O -חשכה O - -קודם O -כל O -יורים O -ב O -כל O -דבר O -ש O -נע O -מ O -ממול O -ממול O -. O -ו B-SENT -אכן O -, O -כך O -קרה O -ב O -פריצה O -ל O -מנזר O - -ה O -חשוך O -לחלוטין O -. O -לא B-SENT -נהרגו O -" O -נזירות O -" O -, O -אלא O -נשים O -ערביות O -, O -כנראה O -משרתות O -ב O -מנזר O -. O -חבל B-SENT -. O - -איש B-SENT -לא O -התכוון O -ל O -כך O -. O -ציטוט B-SENT -מ O -ה O -מאמר O -: O -" O -ב O -עת O -ש O -נורתה O -, O -( O -ה O -נזירה O -) O - -עסקה O -ב O -טיפול O -ב O -פצועים O -" O -. O -איזה B-SENT -פצועים O -? O -הרי B-SENT -כותב O -ה O -מאמר O -קובע O -, O -פסקה O -קודם O -ל O -כן O -, O - -כי O -ה O -מנזר O -היה O -ריק O -מ O -מגינים O -. 
O -אני B-SENT -הייתי O -שם O -, O -ו O -כל O -תיאור O -ה O -גווייה O -( O -" O -זה O -היה O - -מראה O -איום O -של O -דם O -ו O -בשר O -ו O -איברי O -מין O -" O -) O -הוא O -שקר O -ו O -כזב O -. O -לא B-SENT -היה O -כאן O -רצח O -נזירות O - -, O -אלא O -ניסיון O -ל O -רצח O -טוהר O -ה O -נשק O -של O -ה O -פלמ"ח O -ב O -טוהר O -עט O -ה O -סופרים O -של O -ה O -כותב O -. O -ה B-SENT - -ידיעה O -ש O -פורסמה O -תחת O -ה O -כותרת O -: O -" O -הורים O -ב O -בית O -ה O -ספר O -ב O -נווה O -- O -מונוסון O -נגד O -טיול O -ל O -ירושלים O - -; O -טוענים O -: O -הפגנה O -פוליטית O -" O -; O -( O -" O -ה O -ארץ O -" O -, O -0192 O -) O -, O -הטרידה O -אות O -י O -מאוד O -. O - -נילי B-SENT -מנדלר O -, O -ש O -מסרה O -את O -ה O -ידיעה O -מ O -פי O -הורים O -( O -אין O -יודעים O -אם O -מדובר O -ב O -הורה O -בודד O -, O -או O - -ב O -קבוצת O -הורים O -מסוימת O -) O -, O -כתבה O -כי O -" O -אחד O -ה O -הורים O -, O -ש O -החזיר O -ל O -מנהלת O -את O -ה O -ספח O -ה O - -חוזר O -ש O -שב O -שב O -הביע O -התנגדות O -ל O -השתתפות O -ילד O -ו O -ב O -טיול O -, O -טען O -כי O -" O -זהו O -חינוך O -פוליטי O -כאשר O -ילדי O - -נו O -משמשים O -אמצעי O -חי O -ל O -הפגנות O -" O -. O -לשם B-SENT -איזון O -ה O -נושא O -, O -על O -י O -לציין O -כי O -אני O -נמנה O -על O -ה O - -הורים O -ש O -הביעו O -את O -הסכמת O -ם O -ל O -קיום O -ה O -טיול O -. O -לא B-SENT -רק O -ש O -אין O -זה O -עניין O -פוליטי O -אלא O -זה O -נושא O - -חינוכי O -- O -לאומי O -מ O -מדרגה O -ראשונה O -. O -אני B-SENT -גם O -מוכן O -להסתכן O -ו O -לשער O -כי O -ה O -הורה O -, O -או O -קבוצת O -ה O -הורים O - -ש O -התנגדה O -ל O -טיול O -, O -מסתתרים O -תחת O -ה O -מעטה O -של O -טיעון O -פוליטי O -נבוב O -ו O -למעשה O -הם O -פוחדים O -לשלוח O -את O -ילדי O -הם O - -שמא O -איזה O -ערבי O -ינעץ O -ב O -ב O -סכין O -ב O -גב O -. O -אני B-SENT -מוכן O -, O -ו O -רוצה O -, O -לשלוח O -את O -ה O -ילד O -של O - -י O -ל O -ירושלים O -עם O -זר O -פרחים O -ו O -עם O -קופסת O -ממתקים O -, O -כדי O -להביע O -הוקרה O -ו O -עידוד O -ל O -נכד O -של O -אבא O -קובנר O - -, O -ש O -חטף O -סכין O -ב O -ראש O -מידי O -אותו O -מרצח O -שפל O -ש O -דקר O -ל O -מוות O -שלושה O -יהודים O -ב O -שכונת O -בקעה O -. O -אני B-SENT - -רוצה O -ש O -ה O -ילד O -של O -י O -יסתכל O -ב O -עיניים O -של O -ה O -ילד O -ה O -פגוע O -ו O -יגיד O -ל O -ל O -: O -" O -אתה O - -לא O -לבדך O -ב O -ירושלים O -, O -כי O -גם O -אנחנו O -ה O -גרים O -ב O -נווה O -- O -מונסון O -, O -אתך O -אתה O -ב O -לב O -איתן O -ביחד O - -" O -. O -אני B-SENT -מוכן O -להעיד O -כי O -גם O -אב O -י O -ז"ל O -, O -ש O -לקח O -אות O -אני O -ב O -אוטובוס O -ל O -ירושלים O -ב O -שנת O - -תרצ"ח O -כאשר O -ה O -ערבים O -ירו O -על O -כביש O -באב O -אל O -- O -ואד O -, O -אמר O -ל O -ל O -: O -" O -אלי O -! O -אם O -תשמע O - -יריות O -תתכופף O -, O -אבל O -אל O -תפחד O -" O -. O -ו B-SENT -לכן O -אני O -אומר O -ל O -כל O -ילדי O -ישראל O -: O -אל O -תפחדו O -! O -סעו O - -ל O -ירושלים O -, O -אפילו O -אם O -יורים O -או O -אם O -מתעופפים O -שם O -סכינים O -. O -ה B-SENT -פטיש O -ה O -כבד O -ש O -הנחית O -ה O -פועל O -ה O - -ערבי O -על O -ראשי O -הם O -של O -מעסיק O -ו O -ו O -חבר O -ו O -, O -ה O -סכין O -ש O -יד O -ו O -של O -איש O -ה O -מרכול O -תקעה O - -ב O -אחד O -ה O -ספקים O -, O -ו O -רצח O -ה O -מסעדן O -ב O -עין O -כרם O -זה O -לא O -מ O -כבר O -על O -- O -ידי O -אחד O -מ O - -עובדי O -ו O -, O -אף O -הוא O -ערבי O -, O -מעלה O -מ O -נבכי O -ה O -זיכרון O -את O -ה O -פתגם O -: O -" O -ה O -מגדל O -נחש O -שפיפון O - -ב O -בית O -ו O -, O -אל O -יופתע O -אם O -ימצא O -עצמו O -מוכש O -" O -. 
O -אני B-SENT -כותב O -זאת O -ב O -צער O -ו O -ב O -רגשי O -אשם O - -; O -פעם O -חשבתי O -אחרת O -( O -על O -ה O -ערבים O -, O -לא O -על O -ה O -נחשים O -) O -. O -מסתבר B-SENT -ש O -הייתי O -תמים O -. O -מאמר B-SENT - -ה O -מערכת O -ש O -התפרסם O -תחת O -ה O -כותרת O -" O -יחדל O -נא O -מילוא O -להתלבט O -" O -( O -" O -ה O -ארץ O -" O -, O -111 O -) O - -, O -תובע O -למצות O -את O -ה O -דין O -עם O -ה O -מפכ"ל O -ו O -קציני O -ה O -משטרה O -ה O -בכירים O -בגלל O -מחדלי O -הם O -ב O -אירועי O -הר O - -ה O -בית O -. O -זהו B-SENT -כישלונ O -ו O -ה O -יחיד O -עד O -כה O -של O -ה O -מפכ"ל O -, O -ש O -נגרם O -רק O -בגלל O -ה O -עובדה O -ש O - -הוא O -סמך O -יותר O -מדי O -על O -קציני O -ו O -ה O -בכירים O -. O -אם B-SENT -כל O -נושא O -משרה O -יפוטר O -מיד O -עם O -ה O -שגיאה O -ה O -ראשונה O - -, O -תסבול O -ה O -מדינה O -מ O -סבב O -בלתי O -פוסק O -של O -בעלי O -תפקידים O -בכירים O -, O -חסרי O -ניסיון O -ו O -חסרי O -ביטחון O -. O -מתקבל B-SENT -ה O - -רושם O -ש O -ה O -יחידים O -ב O -מדינה O -ה O -זו O -ש O -אינם O -צריכים O -לחשוש O -מ O -תוצאות O -מעשי O -הם O -הם O -ה O -פוליטיקאים O -. O -אלה B-SENT - -לא O -נופלים O -כ O -תוצאה O -מ O -שגיאות O -או O -מ O -מחדלים O -, O -או O -אפילו O -מ O -התנהגות O -פסולה O -, O -אלא O -רק O -בשל O -משחקי O -כוח O - -פוליטיים O -. O -המלצה B-SENT -ל O -פיטורי O -שר O -או O -נושא O -משרה O -פוליטית O -, O -מצד O -בית O -ה O -משפט O -ה O -עליון O -או O -מצד O -ועדת O -חקירה O - -נופלת O -תמיד O -על O -אוזניים O -ערלות O -; O -אבל O -כש O -אפשר O -להטיל O -את O -ה O -אשמה O -על O -קצין O -צבא O -או O -משטרה O -, O -מוטל O -גורל O - -ם O -של O -אלה O -על O -כף O -ה O -מאזניים O -. O -אני B-SENT -תמה O -על O -" O -ה O -ארץ O -" O -ש O -הזדרז O -לשחק O -ל O -ידי O -הם O - -של O -ה O -פוליטיקאים O -, O -ה O -מעוניינים O -להיפטר O -מ O -ה O -מפכ"ל O -ה O -נוכחי O -. O -ב B-SENT -עיתונים O -רבים O -פורסם O -מכתב O -ה O -של O -מלי O - -פיליפסבורן O -, O -יו"ר O -ארגון O -נפגעי O -ה O -משכנתאות O -ו O -חסרי O -ה O -דיור O -, O -תחת O -ה O -כותרת O -" O -אל O -תיגעו O -ב O -משכנתא O -" O - -. O -ב B-SENT -רצונ O -י O -להדגיש O -משפט O -אחד O -ב O -דברי O -ה O -: O -" O -אנו O -, O -נפגעי O -ה O -משכנתאות O -, O -נקלענו O -ל O -מצב O - -נו O -ה O -קשה O -לא O -בגלל O -ריבית O -נמוכה O -, O -אלא O -בגלל O -ה O -אינפלציה O -ו O -ה O -הצמדה O -" O -. O -מלים B-SENT -כ O -דורבנות O -, O - -ה O -תואמות O -במדויק O -את O -מה O -ש O -ה O -חתום O -מטה O -טען O -במשך O -שנים O -רבות O -. O -זו B-SENT -לא O -ה O -ריבית O -ש O -סיבכה O -את O - -מקבלי O -ה O -משכנתאות O -או O -לקוחות O -ה O -בנקים O -ה O -אחרים O -. O -יתר B-SENT -- O -על O -- O -כן O -, O -ה O -ריבית O -ב O -ישראל O -במיוחד O - -על O -משכנתאות O -היא O -מ O -ה O -נמוכות O -ביותר O -ב O -עולם O -, O -אם O -לא O -ה O -נמוכה O -ש O -ב O -הן O -. O -ו B-SENT -באשר O -ל O - -הצמדה O -ו O -ל O -אינפלציה O -ה O -כתובת O -אינה O -ה O -בנקים O -. O -ב B-SENT -16 O -ב O -אוקטובר O -הגיע O -אל O -ה O -עיר O -דה O -מוין O -, O - -ב O -מערב O -ה O -תיכון O -של O -ארצות O -ה O -ברית O -, O -ה O -נשיא O -גורג O -בוש O -. O -כאן B-SENT -הוא O -גם O -נתקל O -ב O -הפגנה O -ה O - -ראשונה O -נגד O -מלחמה O -ב O -מפרץ O -ה O -פרסי O -. O -" B-SENT -לא O -נשפוך O -דם O -למען O -נפט O -" O -, O -קראו O -לעבר O -ו O -כמה O -צעירים O - -ב O -אסיפת O -בחירות O -ל O -טובת O -מועמד O -רפובליקאי O -. O -" B-SENT -לא O -נפט O -" O -, O -כעס O -ה O -נשיא O -, O -" O -מדובר O -פה O -ב O - -תוקפנות O -גלויה O -" O -. O -איובה B-SENT -היא O -מקום O -לא O -- O -שגרתי O -. 
O -אומרים B-SENT -עלי O -ה O -, O -ש O -יש O -ב O -ה O -יותר O -ארגונים O - -פציפיסטיים O -ל O -קילומטר O -מרובע O -מאשר O -ב O -כל O -מדינה O -אחרת O -של O -ארה"ב O -. O -פציפיזם B-SENT -אינו O -מוגבל O -ב O -איובה O -ל O -סטודנטים O -רדיקליים O -, O - -אפשר O -למצוא O -אות O -ו O -גם O -בין O -חקלאים O -ו O -פועלי O -חרושת O -. O -איובה B-SENT -היא O -מ O -אסמי O -ה O -דגנים O -של O -ארה"ב O -, O -ו O - -ל O -פרנסת O -ה O -היא O -מייצאת O -אות O -הם O -ל O -מדינות O -כ O -ברית O -ה O -מועצות O -. O -ב B-SENT -0891 O -נקלעה O -איובה O -, O -יחד O -עם O - -מדינות O -חקלאיות O -אחרות O -ב O -מערב O -ה O -תיכון O -, O -ל O -סחרור O -קשה O -: O -ה O -נשיא O -גימי O -קרטר O -החליט O -להטיל O -חרם O -דגנים O -על O - -ה O -סובייטים O -, O -בגלל O -פלישת O -ם O -ל O -אפגניסטן O -. O -איובה B-SENT -שילמה O -את O -ה O -מחיר O -עד O -אמצע O -שנות O -ה O -80 O -. O -כלכלת B-SENT - -ה O -היתה O -שקועה O -ב O -מיתון O -קשה O -, O -ב O -שעה O -ש O -רוב O -ה O -מדינות O -ה O -אחרות O -ב O -ארה"ב O -נהנו O -מ O -שגשוג O -חסר O - -תקדים O -. O -לא B-SENT -רק O -מניעים O -אלטרואיסטיים O -מעוררים O -איפוא O -ב O -איובה O -התנגדות O -ל O -מלחמות O -, O -אלא O -גם O -מניעים O -תועלתניים O -. O -כאן B-SENT -אין O - -אוהבים O -ביותר O -את O -ה O -רעיון O -של O -מלחמה O -ב O -מפרץ O -ה O -פרסי O -. O -יום B-SENT -אחד O -ב O -שבוע O -ש O -עבר O -מרח O -ה O -עיתון O - -ה O -מקומי O -" O -דה O -מוין O -רגיסטר O -" O -כותרת O -ענקית O -ל O -רוחב O -מלוא O -ה O -עמוד O -ה O -ראשון O -של O -מוספ O -ו O -ה O -יומי O - -. O -" B-SENT -, O -אמרה O -ה O -כותרת O -ב O -אנגלית O -, O -" O -שלום O -עכשיו O -" O -. O -לצד B-SENT -ה O -אותיות O -ה O -אדומות O -הציב O -ה O - -עיתון O -את O -סמל O -תנועת O -ה O -התנגדות O -ל O -מלחמת O -וייטנאם O -. O -ב B-SENT -יום O -אחר O -פירסם O -ה O -עיתון O -ב O -עמוד O -מאמרי O -ה O -מערכת O - -של O -ו O -קריקטורה O -מקפיאת O -דם O -. O -נראה B-SENT -ב O -ה O -מלאך O -ה O -מוות O -חבוש O -כאפייה O -, O -ו O -מ O -פי O -ו O -בוקעות O -ה O - -מלים O -: O -" O -קרא O -את O -שפתיי O -, O -גורג O -לא O -תהיה O -עוד O -וייטנאם O -" O -. O -" B-SENT -גורג O -" O -הוא O -כמובן O -גורג O -בוש O - -. O -" B-SENT -קרא O -את O -שפתיי O -... O -" O -היא O -פאראפראזה O -על O -סיסמת O -ה O -בחירות O -ה O -כוזבת O -של O -ה O -נשיא O -מ O -8891 O -, O - -" O -קראו O -את O -שפתיי O -לא O -יהיו O -מסים O -חדשים O -" O -. O -ב B-SENT -מהלך O -נשף O -התרמה O -ל O -טום O -הארקין O -, O -ה O -סנאטור O -ה O - -דמוקרטי O -ש O -העמיד O -את O -עצמו O -השבוע O -ל O -בחירה O -חוזרת O -, O -הסביר O -ל O -י O -ב O -אריכות O -אחד O -ה O -נוכחים O -כיצד O -יכולה O -ארה"ב O - -ליישב O -את O -סכסוך O -ה O -מפרץ O -ה O -פרסי O -, O -" O -אם O -רק O -תביא O -את O -ה O -עניין O -לפני O -בית O -ה O -משפט O -ה O -בין O - -- O -לאומי O -" O -ב O -האג O -. O -התבוננתי B-SENT -ב O -בן O -שיח O -י O -ב O -הפתעה O -ניכרת O -. O -טיעונים B-SENT -כ O -אלה O -, O -ש O -פעם O - -היו O -מקובלים O -בין O -ליברלים O -ב O -ארה"ב O -, O -לא O -שמעתי O -כבר O -הרבה O -זמן O -. O -ב B-SENT -מעמקי O -ה O -מערב O -ה O -תיכון O -של O -ארה"ב O - -קל O -להאמין O -ש O -ה O -עולם O -נברא O -ב O -צלם O -אמריקה O -, O -ש O -הוא O -מלא O -אנשים O -הגיוניים O -ו O -פרגמטיים O -, O -ו O -ש O -ה O - -אותוריטה O -ה O -מוסרית O -של O -מוסד O -ש O -איש O -, O -כולל O -ארה"ב O -, O -מעולם O -לא O -לקח O -ב O -רצינות O -תעשה O -רושם O -מיידי O -על O -סדאם O - -חוסיין O -. O -ה B-SENT -משבר O -ב O -מפרץ O -ה O -פרסי O -לא O -היה O -נושא O -בחירות O -מרכזי O -ב O -איובה O -. 
O -טום B-SENT -הארקין O -ה O -ליברלי O -, O - -ו O -יריב O -ו O -ה O -רפובליקאי O -ה O -שמרן O -טום O -טקי O -, O -אמנם O -לא O -יכלו O -להסכים O -על O -שום O -עניין O -, O -אבל O -נטו O -להסכים O - -על O -ה O -מפרץ O -ה O -פרסי O -. O -שניהם B-SENT -תומכים O -ב O -נשיא O -, O -ו O -שניהם O -מאמינים O -ב O -צורך O -ב O -זהירות O -. O -ב B-SENT -ימי O - -ה O -מלחמה O -ה O -קרה O -היה O -טוקי O -אנטי O -- O -קומוניסט O -נלהב O -, O -ו O -הארקין O -היה O -יונה O -נלהבת O -. O -טוקי B-SENT -תמך O -ב O -כל O - -סעיף O -הוצאות O -צבאיות O -אפשרי O -, O -ו O -חייב O -את O -ה O -מאבק O -ל O -הפלת O -ממשלת O -ניקרגואה O -; O -הארקין O -רצה O -קיצוצים O -ניכרים O -ב O -תקציב O - -ה O -ביטחון O -, O -ו O -עמד O -ב O -ראש O -ה O -מאבק O -ב O -סנאט O -נגד O -ה O -סיוע O -ל O -קונטראס O -. O -ב B-SENT -מפרץ O -ה O -פרסי O - -שניהם O -נצים O -יוניים O -, O -או O -יונים O -נציות O -, O -ב O -מידה O -כמעט O -שווה O -. O -ב B-SENT -העדר O -קומוניזם O -יש O -פרגמטיזם O -ו O -אל O -פני O - -ה O -שטח O -חוזרת O -גם O -מידה O -הגונה O -של O -בדלנות O -אינסטינקטיווית O -, O -אפיון O -היסטורי O -של O -מדינות O -ה O -מערב O -ה O -תיכון O -ה O -אמריקאי O -. O - -ב B-SENT -מהלך O -מערכת O -ה O -בחירות O -הקרין O -מטה O -ו O -של O -טוקי O -תשדיר O -טלוויזיה O -, O -ש O -אחד O -מ O -סעיפי O -ו O -היה O -התנגדות O -ל O - -סיוע O -חוץ O -. O -ב B-SENT -עיני O -ה O -שדולה O -ה O -פרו O -- O -ישראלית O -ב O -וואשינגטון O -, O -התנגדות O -ל O -סיוע O -חוץ O -מעמידה O -את O -ה O - -מתנגד O -מ O -חוץ O -ל O -מחנה O -גם O -אם O -אינה O -מוסבת O -ספציפית O -על O -ישראל O -. O -פעילים B-SENT -פרו O -- O -ישראליים O -עודדו O -מאז O -ו O -מעולם O - -יהודים O -לתרום O -כספים O -ל O -מועמדים O -, O -ה O -נאבקים O -נגד O -מתנגדי O -סיוע O -ה O -חוץ O -. O -ממילא B-SENT -, O -טום O -הארקין O -, O -תומך O -עקיב O - -ב O -סיוע O -ה O -חוץ O -, O -היה O -אחד O -ה O -מקבלים O -ה O -גדולים O -ביותר O -של O -תרומות O -יהודיות O -ב O -שש O -ה O -שנים O -ה O -אחרונות O - -. O -לפי B-SENT -מקור O -יהודי O -ב O -איובה O -, O -הארקין O -קיבל O -002 O -אלף O -דולר O -מ O -וועדי O -פעולה O -פוליטיים O -פרו O -- O -ישראליים O -ב O -כל O - -רחבי O -ארה"ב O -. O -טוקי B-SENT -קיבל O -רק O -52 O -אלף O -. O -ב B-SENT -48 O -תיאר O -מנכ"ל O -אייפא"ק O -טום O -דאיין O -את O -הארקין O -כ O -אחד O -מ O - -ששה O -או O -שבעה O -סנאטורים O -, O -ש O -נבחרו O -בזכות O -" O -כסף O -יהודי O -" O -. O -זו B-SENT -היתה O -התרברבות O -לא O -- O -דיסקרטית O -, O -ו O - -אין O -ספק O -ש O -יימנע O -ממנ O -ממנ O -השנה O -. O -אף B-SENT -- O -על O -- O -פי O -- O -כן O -, O -שתדלנים O -פרו O -- O -ישראליים O -ב O - -וואשינגטון O -חככו O -השבוע O -את O -ידי O -הם O -ב O -שביעות O -רצון O -ל O -משמע O -ה O -ידיעה O -, O -ש O -הארקין O -גבר O -על O -טוקי O -ב O -הפרש O - -משכנע O -של O -10 O -% O -. O -הוא B-SENT -ה O -סנאטור O -ה O -דמוקרטי O -ה O -ראשון O -ב O -תולדות O -איובה O -ה O -נבחר O -ל O -תקופת O -כהונה O -שנייה O - -. O -הוא B-SENT -גם O -יודע O -, O -ש O -בלי O -תרומות O -מ O -חוץ O -ל O -גבולות O -מדינת O -ו O -ה O -קטנה O -, O -יתקשה O -לאסוף O -די O -כסף O - -ל O -ניהול O -מערכת O -ה O -בחירות O -ה O -טלוויזיונית O -ה O -יקרה O -, O -ש O -שבלעדי O -שבלעדי O -אין O -פוליטיקאים O -אמריקאיים O -מסוגלים O -לנצח O -. O -חזקה B-SENT -על O - -הארקין O -, O -ש O -יצביע O -גם O -ב O -עתיד O -בעד O -חוק O -סיוע O -ה O -חוץ O -. 
O -מזכירות B-SENT -איגוד O -ה O -פועלים O -ה O -חקלאיים O -הציעה O -אתמול O - -ש O -ה O -ממשלה O -תשלם O -מענק O -חודשי O -של O -500 O -ש"ח O -, O -נוסף O -ל O -שכר O -ה O -רגיל O -, O -ל O -כל O -ישראלי O -ש O -יעבוד O - -ב O -קטיף O -ה O -הדרים O -במשך O -שלושה O -חודשים O -לפחות O -. O -מזכיר B-SENT -איגוד O -ה O -פועלים O -ה O -חקלאיים O -, O -חיים O -אביבי O -, O -מסר O -ש O - -ה O -הצעה O -הועלתה O -לנוכח O -ה O -מחסור O -ב O -עובדי O -קטיף O -ה O -מורגש O -כבר O -כעת O -ו O -יוחרף O -ב O -חודש O -ה O -בא O -, O -כש O - -ה O -קטיף O -יהיה O -ב O -עיצומ O -ו O -, O -בעיקר O -עקב O -היעדרות O -עובדים O -מ O -ה O -שטחים O -. O -ה B-SENT -מעסיקים O -אינם O -מצפים O -ש O -יצליחו O - -למשוך O -מספר O -ניכר O -של O -עובדים O -ישראליים O -ל O -קטיף O -, O -בגלל O -ה O -שכר O -ה O -נמוך O -ה O -משולם O -ל O -עבודה O -זו O -מעט O -מעל O - -שכר O -ה O -מינימום O -ב O -משק O -. O -ב B-SENT -שבוע O -ה O -בא O -ידון O -מנכ"ל O -התאחדות O -ה O -איכרים O -, O -שלמה O -רייזמן O -, O -עם O -מנכ"ל O - -שירות O -ה O -תעסוקה O -, O -דוד O -מנע O -, O -ב O -דרישת O -ה O -התאחדות O -לשכנע O -עולים O -חדשים O -ה O -לומדים O -ב O -אולפנים O -לעבוד O -ב O -קטיף O - -. O -רייזמן B-SENT -מציע O -ש O -ה O -ממשלה O -תסבסד O -מחצית O -מ O -עלות O -העסקת O -ם O -של O -ה O -עולים O -ב O -קטיף O -. O -מזכירות B-SENT -איגוד O -ה O - -פועלים O -ה O -חקלאיים O -אישרה O -אתמול O -נקיטת O -עיצומים O -אם O -עד O -סוף O -ה O -שבוע O -ה O -בא O -לא O -יושג O -הסכם O -שכר O -ל O -שנתיים O -ה O - -באות O -, O -ב O -מסגרת O -ו O -יועלה O -שכר O -ה O -עובדים O -ב O -20 O -% O -. O -הסכם B-SENT -ה O -שכר O -פג O -ב O -אפריל O -. O -יו"ר B-SENT - -רשות O -שדות O -ה O -תעופה O -, O -אריה O -גרוסבורד O -, O -נהרג O -ב O -יום O -שני O -ב O -תאונת O -דרכים O -ב O -ארצות O -- O -ה O -ברית O -. O - -גרוסבורד B-SENT -נהג O -לבדו O -ב O -מכונית O -, O -ב O -דרכ O -ו O -מ O -ה O -עיר O -מיניאפוליס O -ב O -אינדיאנה O -ל O -נמל O -ה O -תעופה O -של O -ה O - -. O -משאית B-SENT -פגעה O -ב O -מכונית O -ו O -, O -ו O -הוא O -נהרג O -ב O -מקום O -. O -אריה B-SENT -גרוסבורד O -אמור O -היה O -לטוס O -מ O -מיניאפוליס O -ל O - -שיקאגו O -, O -ו O -מנ O -ה O -ל O -ניו O -- O -יורק O -. O -שם B-SENT -היה O -אמור O -להיפגש O -עם O -אשת O -ו O -, O -ש O -עשתה O -ב O - -בוסטון O -. O -בני B-SENT -ה O -זוג O -גרוסבורד O -תוכננו O -לשוב O -היום O -אחרי O -- O -ה O -צהריים O -ל O -ישראל O -. O -דבר B-SENT -מות O -ו O -של O -גרוסבורד O - -נודע O -רק O -אתמול O -ב O -צהריים O -ל O -רשות O -שדות O -ה O -תעופה O -. O -ה B-SENT -גורמים O -, O -ש O -טיפלו O -ב O -דבר O -ב O -ארץ O -, O - -לא O -קישרו O -את O -שמ O -ו O -עם O -תפקיד O -ו O -. O -משטרת B-SENT -מיניאפוליס O -מצאה O -ב O -מכונית O -ו O -את O -מזוודות O -יו O -עם O -תגי O -ה O - -זיהוי O -. O -ה B-SENT -משטרה O -דיווחה O -ל O -מחלקת O -ה O -מדינה O -ה O -אמריקאית O -, O -ש O -העבירה O -את O -ה O -מידע O -ל O -קונסוליה O -ה O -ישראלית O - -ב O -שיקאגו O -, O -ה O -מטפלת O -ב O -אזור O -מיניאפוליס O -. O -ה B-SENT -קונסוליה O -ב O -שיקאגו O -הבריקה O -ל O -ירושלים O -. O -מ B-SENT -משרד O -ה O -חוץ O - -ב O -ירושלים O -נשלח O -שליח O -להודיע O -על O -דבר O -ה O -מוות O -ל O -בני O -ה O -משפחה O -, O -אולם O -לא O -היה O -איש O -ב O -דירת O -ו O - -. O -רק B-SENT -אחר O -- O -כך O -נמצא O -כרטיס O -ביקור O -, O -ו O -ב O -ו O -שמ O -ו O -ו O -תואר O -ו O -של O -גרוסבורד O -כ O -מנהל O - -מפעל O -פרטי O -. O -אנשי B-SENT -משרד O -ה O -חוץ O -התקשרו O -אתמול O -ל O -מפעל O -, O -ו O -רק O -אז O -התברר O -כי O -מדובר O -ב O -יו"ר O -רשות O - -שדות O -ה O -תעופה O -. 
O -גרוסבורד B-SENT -, O -בן O -95 O -ב O -מות O -ו O -, O -השאיר O -אחרי O -אחרי O -אם O -, O -אחות O -, O -רעיה O -, O - -בן O -ו O -שתי O -בנות O -. O -עדיין B-SENT -לא O -נקבע O -מועד O -ל O -הלוויית O -ו O -. O -מרכז B-SENT -ה O -מידע O -ל O -זכויות O -ה O -אדם O -ב O - -שטחים O -, O -" O -בצלם O -" O -, O -מפרסם O -מ O -פעם O -ל O -פעם O -דפי O -מידע O -ו O -ב O -הם O -פרטים O -על O -ה O -נעשה O -ב O - -שטחים O -ב O -תחומים O -שונים O -. O -ה B-SENT -סגנון O -ענייני O -מאוד O -, O -ב O -דרך O -כלל O -יש O -ב O -ו O -נגיעה O -ל O -ענייני O -חוק O -. O - -" B-SENT -בצלם O -" O -נוהג O -להפיץ O -את O -דפי O -ה O -מידע O -של O -ו O -בין O -בעלי O -עניין O -שונים O -, O -חלק O -ם O -בעלי O -משרות O -ב O - -מערכת O -ה O -משפט O -. O -ב B-SENT -ימים O -אלה O -התקבל O -ב O -משרדי O -ה O -ארגון O -מכתב O -מ O -משרד O -ה O -משפטים O -, O -על O -נייר O -רשמי O - -, O -ו O -זה O -לשונ O -ו O -: O -" O -ה O -נדון O -ה O -חוברת O -של O -של O -. O -רצ"ב B-SENT -מוחזר O -אלי O -כם O -ה O -חומר O -ש O - -נשלח O -אל O -י O -. O -כמות B-SENT -ו O -אתם O -שולחים O -אל O -י O -מדי O -פעם O -ו O -הוא O -נזרק O -ישר O -ל O -פח O -. O -אבקשכם B-SENT -אבקש O - -אבקש O -לשלוח O -אל O -י O -חומר O -זה O -. O -מען B-SENT -עבודת O -י O -מיועד O -ל O -קבלת O -חומר O -עבודה O -בלבד O -" O -. O -על B-SENT -ה O -חתום O - -נגה O -ענתבי O -, O -ה O -ממונה O -על O -נוסח O -ה O -חוק O -ו O -רשומות O -. O -ו B-SENT -ל O -מי O -ש O -יספר O -פעם O -את O -סיפור O -ה O - -של O -שגרת O -ה O -חיים O -ב O -ימים O -ה O -מטורפים O -ה O -אלה O -, O -הנה O -הערת O -שוליים O -. O -על B-SENT -חשבונות O -ה O -מים O -ש O -עיריית O - -ירושלים O -שלחה O -ב O -ימים O -אלה O -ל O -תושבים O -הודפסה O -הנחיה O -ב O -זו O -ה O -לשון O -: O -" O -אם O -יש O -ל O -ל O -מיכל O -מים O - -על O -גג O -ה O -בית O -, O -אנא O -דאג O -ל O -נעילת O -ה O -כניסה O -אל O -ה O -גג O -, O -כדי O -למנוע O -מ O -אנשים O -זרים O -לזהם O - -או O -להרעיל O -את O -מי O -ה O -שתייה O -" O -. O -ה B-SENT -סתיו O -את O -נו O -. O -ה B-SENT -עלים O -מזהיבים O -. O -קצ B-SENT -ה O -של O -שנה O - -נוספת O -נראה O -במעורפל O -ב O -אופק O -, O -ו O -עדיין O -לא O -זכית O -ב O -מענק O -" O -גאונות O -" O -של O -קרן O -מקארתור O -. O -גם B-SENT -אני O - -לא O -. O -פרסום B-SENT -רשימת O -ה O -זוכים O -השנה O -, O -כמו O -תמיד O -, O -די O -ב O -ו O -כדי O -לגרום O -ל O -סופרים O -, O -ל O -אמנים O - -, O -ל O -אקדמאים O -ו O -ל O -פעילים O -ב O -מסגרות O -שונות O -ב O -רחבי O -ארצות O -ה O -ברית O -כול O -ה O -לצפות O -ל O -רשימת O -ה O -שנה O - -ה O -באה O -, O -כש O -ב O -לב O -ם O -מקננת O -ה O -תקווה O -ש O -אולי O -גם O -הם O -ייכללו O -ב O -ב O -. O -אחרי B-SENT -כ O -כלות O - -הכל O -, O -63 O -ה O -זוכים O -אם O -לשפוט O -על O -פי O -ה O -כתוב O -עלי O -הם O -ב O -עיתונות O -אינם O -נראים O -מיוחדים O -כל O -כך O -. O - -אבל B-SENT -כמובן O -, O -לאחר O -בדיקה O -קפדנית O -, O -מתברר O -ש O -זה O -בדיוק O -מה O -ש O -הם O -מיוחדים O -. O -קחו B-SENT -למשל O -את O -מריה O -ואראלה O - -. O -ב B-SENT -תחילה O -היא O -מצטיירת O -כמו O -כל O -מארגנת O -קהילתית O -אחרת O -. O -אבל B-SENT -ב O -מבט O -מ O -קרוב O -מתברר O -ש O -היא O -נמנתה O -עם O - -מקימי O -גנאדאס O -דל O -ואיה O -, O -קואופרטיב O -ל O -גידול O -כבשים O -ו O -ל O -אריגה O -ב O -לוס O -אוחוס O -, O -ניו O -- O -מקסיקו O -. O -האם B-SENT - -אתם O -יכולים O -להתעלות O -על O -כך O -? O -ו B-SENT -ישנם O -גם O -ה O -זוכים O -ה O -מיוחדים O -ב O -מובן O -ה O -יותר O -שנוי O -ב O -מחלוקת O -של O - -ה O -מלה O -. 
O -למרות B-SENT -ש O -מענקי O -קרן O -מקארתור O -נועדו O -לשחרר O -את O -ה O -מוכשרים O -מ O -נטל O -ה O -השתכרות O -ל O -מחיית O -ם O -, O - -כמה O -מענקים O -מגיעים O -תמיד O -ל O -אנשים O -ש O -אינם O -בדיוק O -נאבקים O -כדי O -לזכות O -ב O -הכרה O -ו O -ב O -חופש O -יצירה O -( O -ה O -דוגמה O - -ה O -עיקרית O -של O -ה O -שנה O -היא O -סוזן O -זונטג O -) O -. O -אם B-SENT -ב O -כל O -זאת O -אתם O -רוצים O -סבסוד O -ל O -מחשבותי O -כם O -ו O - -ל O -מעשי O -כם O -, O -עלי O -עלי O -להשיג O -אות O -ו O -ב O -ה O -דרך O -ה O -מיושנת O -: O -לפנות O -ל O -קרן O -, O -ו O -לשכנע O - -אנשים O -בעלי O -כסף O -ש O -אתם O -זכאים O -ל O -חלק O -ממנ O -ו O -. O -להלן B-SENT -כמה O -עצות O -. O -עצה B-SENT -מספר O -1 O -: O -הצטרפו O -ל O - -גופים O -. O -כמעט B-SENT -אף O -אחד O -מ O -שבעה O -מיליארדי O -ה O -דולרים O -ש O -קרנות O -אמריקאיות O -מחלקות O -מדי O -שנה O -אינו O -מיועד O -ל O -יחידים O -. O - -זו B-SENT -אחת O -ה O -סיבות O -ל O -כך O -ש O -מענקי O -מקארתור O -מבוקשים O -כל O -כך O -. O -ב B-SENT -דרך O -כלל O -מופנה O -ה O -כסף O -ל O -קולגים O - -, O -ל O -מכוני O -מחקר O -, O -( O -כולל O -צוותות O -חשיבה O -) O -, O -ל O -קבוצות O -ל O -שירות O -קהילתי O -, O -ל O -מוזיאונים O -, O -ל O - -ספריות O -ו O -ל O -ארגונים O -אחרים O -שלא O -ל O -מטרות O -רווח O -. O -משום B-SENT -כך O -, O -ראשית O -כל O -עלי O -ך O -לחשוב O -על O -קבלת O -תפקיד O - -חשוב O -ב O -אחד O -מ O -מאות O -ה O -גופים O -הללו O -. O -אז B-SENT -תוכל O -לתור O -אחר O -כסף O -ב O -אורח O -עצמאי O -פחות O -או O -יותר O -, O - -אפילו O -לייסד O -במשך O -ה O -זמן O -מכון O -מ O -ך O -ב O -מסגרת O -ה O -מכון O -נותן O -ה O -חסות O -, O -למרות O -ש O -נותן O -ה O -חסות O - -של O -של O -ירצה O -אחוז O -שמן O -מ O -ה O -מענק O -כדי O -לכסות O -" O -הוצאות O -קבועות O -" O -. O -אם B-SENT -אתה O -מעדיף O -לפעול O -בלי O -איש O - -ביניים O -, O -אתה O -יכול O -להקים O -ארגון O -מ O -של O -ך O -שלא O -ל O -מטרות O -רווח O -. O -כך B-SENT -נהג O -איל O -ה O -הון O -בעל O -ה O - -מודעות O -ה O -חברתית O -טד O -טרנר O -, O -כאשר O -מימן O -את O -" O -ה O -התאחדות O -ל O -עולם O -טוב O -יותר O -" O -, O -ה O -מבקשת O -מענקים O - -מ O -קרנות O -כדי O -להפיק O -סרטי O -טלוויזיה O -דוקומנטריים O -. O -לפעמים B-SENT -, O -אחרי O -ש O -קיבל O -מימון O -פטור O -מ O -מס O -באמצעות O -ה O -התאחדות O -של O - -ו O -שלא O -ל O -מטרות O -רווח O -מקרין O -טרנר O -את O -ה O -סרטים O -ה O -דוקומנטריים O -ב O -רשת O -ה O -טלוויזיה O -של O -ו O -ה O -קיימת O -בהחלט O - -כדי O -לשאת O -רווחים O -ו O -ב O -רשתות O -אחרות O -כמו O -ה O -. O -אבל B-SENT -אל O -תטרחו O -לדווח O -ל O -שלטונות O -ה O -מקומיים O -. O -ה B-SENT -גבול O - -בין O -" O -ש O -לא O -ל O -מטרות O -רווח O -" O -ו O -בין O -" O -ל O -מטרות O -רווח O -" O -מטושטש O -מאוד O -, O -ו O -ב O -דרך O - -כלל O -הכל O -חוקי O -למהדרין O -. O -ה B-SENT -דוגמה O -ה O -ידועה O -ביותר O -ל O -שמצה O -מ O -ה O -תקופה O -ה O -אחרונה O -היא O -ה O -שימוש O -ב O - -כספי O -קרן O -גאנט O -ל O -רכישת O -אלפי O -עותקים O -של O -ה O -אוטוביוגרפיה O -של O -יו"ר O -גאנט O -לשעבר O -, O -אלאן O -נויהארט O -. O -ה B-SENT -דבר O -סייע O - -להכניס O -את O -ה O -ספר O -ל O -רשימת O -רבי O -- O -ה O -מכר O -של O -" O -ניו O -יורק O -טיימס O -" O -, O -מעמד O -ש O -ב O -דרך O - -כלל O -גורר O -בעקבותי O -ו O -מכירת O -עשרות O -אלפי O -עותקים O -נוספים O -. O -עצה B-SENT -מספר O -2 O -. O -לך B-SENT -להיות O -זמר O -אופרה O -או O -ביולוג O -מולקולרי O - -. O -ב B-SENT -ספר O -" O -צדקה O -מתחילה O -מ O -בית O -" O -התלוננה O -תרזה O -אודנדל O -על O -מחזוריות O -ה O -פילנטרופיה O -. 
O -חלק B-SENT -נכבד O -מ O -תרומותי O - -הן O -של O -קרנות O -( O -ו O -של O -חברות O -ו O -אנשים O -עשירים O -) O -מגיע O -לבסוף O -ל O -נקודת O -ה O -התחלה O -של O -הן O -: O -ל O - -דרגים O -ה O -עליונים O -של O -ה O -חברה O -, O -ל O -מוזיאונים O -, O -ל O -אולמות O -קונצרטים O -ו O -ל O -תיאטראות O -קהילתיים O -, O -ש O -סוג O -ה O - -אנשים O -ה O -יכול O -להרשות O -ל O -עצמו O -לתרום O -תרומות O -צדקה O -הוא O -ה O -מנהל O -אות O -ם O -ו O -פוקד O -אות O -ם O -. O -51 B-SENT -% O - -מ O -כספי O -ה O -קרנות O -מופנים O -ל O -פעילויות O -תרבותיות O -. O -ל B-SENT -דעת O -אנשים O -ה O -עשויים O -לחשוב O -על O -קדימויות O -חברתיות O -נאצלות O -יותר O -מאשר O - -הקמת O -סביבה O -תומכת O -ל O -אופרה O -, O -זה O -שיעור O -גבוה O -מאוד O -. O -ל B-SENT -מרבה O -ה O -מזל O -, O -כ O -שליש O -מ O -ה O -כסף O - -מועבר O -ל O -פרויקטים O -ש O -ה O -תועלת O -ב O -הם O -קונקרטית O -יותר O -, O -כגון O -שירותי O -בריאות O -או O -מחקרים O -ב O -תחומי O -ה O -רפואה O -או O - -מדעי O -ה O -טבע O -. O -קצת B-SENT -יותר O -מ O -שליש O -מושקע O -ב O -שטח O -ה O -הפקר O -ה O -גדול O -ש O -ב O -ו O -מתחרים O -על O -תשומת O - -ה O -לב O -רעיונות O -ו O -תוכניות O -ב O -מדעי O -ה O -חברה O -ו O -ב O -תחום O -ה O -מדיניות O -ה O -ציבורית O -. O -ב B-SENT -תוככי O -אזור O -זה O - -, O -ש O -בירת O -ו O -היא O -ואשינגטון O -, O -ה O -עצות O -ה O -באות O -הן O -ה O -ישימות O -ביותר O -. O -עצה B-SENT -מספר O -3 O -. O -הייה B-SENT - -מודע O -ל O -השלכות O -ה O -מרובות O -של O -עבודת O -ך O -. O -" B-SENT -אתה O -מתבונן O -ב O -נושא O -ש O -יהיה O -בעל O -ערך O -לגיטימי O -ל O -מחקר O - -, O -ו O -ב O -ו O -ב O -זמן O -יכבוש O -את O -דמיונ O -ו O -של O -פקיד O -ה O -תוכנית O -" O -, O -אומר O -וורן O -מילר O -, O -איש O - -מדעי O -ה O -מדינה O -ב O -אוניברסיטת O -אריזונה O -. O -במשך B-SENT -עשרות O -שנים O -שימש O -מילר O -מגייס O -מענקים O -מרכזי O -ל O -פרויקט O -חקר O -ה O -בחירות O -ה O - -ארציות O -, O -מחקר O -על O -התנהגות O -מצביעים O -ש O -בעקבותי O -ו O -נכתב O -ב O -1960 O -ה O -חיבור O -ה O -קלאסי O -" O -ה O -מצביע O -ה O -אמריקאי O - -" O -. O -מאז B-SENT -המשיך O -ב O -מחקר O -ו O -הגדיל O -את O -מאגר O -ה O -נתונים O -של O -של O -. O -עצה B-SENT -מספר O -4 O -. O -תבטיח B-SENT -להציל O - -את O -ה O -עולם O -. O -על B-SENT -פי O -כל O -ה O -דיווחים O -חלה O -ב O -עשורים O -ה O -אחרונים O -ירידה O -תלולה O -ב O -תמיכה O -ב O -עבודות O -מחקר O - -ב O -מדעי O -ה O -חברה O -ש O -אין O -ל O -ל O -יישום O -מעשי O -מידי O -. O -מילר B-SENT -זוכר O -ש O -ב O -שנות O -ה O -05 O -ו O -ה O - -06 O -הוא O -יכול O -היה O -להסתמך O -על O -סיוע O -מ O -קרנות O -פורד O -ו O -רוקפלר O -. O -ניסיונ B-SENT -ו O -ה O -אחרון O -של O -מילר O -להשיג O -כספים O - -מ O -קרן O -פורד O -היה O -ב O -תחילת O -שנות O -ה O -07 O -. O -אז B-SENT -הוא O -גילה O -עניין O -ב O -גורמים O -ה O -משפיעים O -על O -שיעור O -ה O - -מצביעים O -, O -ו O -פקיד O -ה O -תוכנית O -גילה O -עניין O -ב O -שיעור O -ה O -מצביעים O -ה O -נמוך O -בקרב O -בני O -מיעוטים O -. O -נראה B-SENT -היה O -ש O - -זה O -זיווג O -טוב O -. O -מילר B-SENT -הדגיש O -ש O -יעד O -ה O -מחקר O -יהיה O -איתור O -מחסומים O -ה O -מונעים O -מ O -בני O -מיעוטים O -להצביע O -, O -דבר O - -ש O -ל O -דעת O -ו O -עשוי O -היה O -להיות O -צעד O -ראשון O -לקראת O -העלאת O -שיעור O -ה O -הצבעה O -. O -אך B-SENT -זה O -לא O -היה O -מספיק O -טוב O - -ל O -קרן O -. O -קרן B-SENT -פורד O -רצתה O -להיות O -בטוחה O -ש O -ה O -מחקר O -יגדיר O -במפורש O -את O -ה O -דרכים O -ל O -העלאת O -שיעור O -ה O -מצביעים O - -בקרב O -מיעוטים O -. 
O -מילר B-SENT -לא O -יכול O -היה O -להבטיח O -זאת O -ב O -מצפון O -נקי O -. O -מ B-SENT -אז O -7791 O -מומן O -פרויקט O -חקר O -ה O -בחירות O - -ה O -ארציות O -על O -- O -ידי O -קרן O -ה O -מדע O -ה O -ארצית O -, O -ה O -תומכת O -ה O -גדולה O -ביותר O -ב O -מדעי O -חברה O -בסיסיים O -. O - -עצה B-SENT -מספר O -5 O -. O -הייה B-SENT -קטליזטור O -ל O -שינוי O -. O -יש B-SENT -אנשים O -ה O -מתקשים O -להבטיח O -ב O -פרצוף O -פוקר O -ש O -הם O -יציעו O -פתרון O - -ל O -בעיה O -חברתית O -מרכזית O -. O -ל B-SENT -אנשים O -אלה O -מומלץ O -ליהפך O -ל O -קטליזטורים O -: O -לקבץ O -יחד O -אנשים O -רבים O -ה O -יכולים O -להבטיח O -פתרונות O - -או O -ש O -לפחות O -יכולים O -היו O -, O -אילו O -סגרו O -אות O -ם O -ב O -מרכז O -ועידות O -במשך O -ימים O -, O -להציע O -פתרון O -. O -ה B-SENT -קרנות O - -אוהבות O -יותר O -ו O -יותר O -סימנים O -מוחשיים O -כ O -אלה O -ל O -השפעה O -ש O -יש O -ל O -כנסים O -ו O -ל O -התוועדויות O -. O -בעיקר B-SENT -הן O -אוהבות O - -לאסוף O -אנשים O -ש O -אילולא O -כן O -אפשר O -ש O -לא O -היו O -נפגשים O -לעולם O -: O -אקדמאים O -ו O -פוליטיקאים O -, O -ביורוקרטים O -ו O -נבחרי O -ציבור O -, O - -פנאטים O -של O -ה O -שמאל O -ו O -פנאטים O -של O -ה O -ימין O -. O -ו B-SENT -זכרו O -: O -אפשר O -ש O -פקיד O -ה O -תוכנית O -של O -ה O -קרן O - -יצטרף O -גם O -הוא O -ל O -מסע O -; O -הקפידו O -איפוא O -ב O -בחירת O -ה O -אתר O -. O -אומרים B-SENT -ש O -מרכז O -ה O -וועידות O -ו O -ה O -לימודים O - -של O -קרן O -רוקפלר O -ב O -בלאגיו O -ש O -ב O -איטליה O -הוא O -מקום O -טוב O -ל O -חישול O -קונסנזוס O -על O -בעיות O -דחופות O -של O -ה O -עולם O -. O - -גם B-SENT -אם O -לא O -ייווצר O -קונסנסוס O -חדש O -מ O -אלכימיה O -זו O -, O -קרוב O -ל O -וודאי O -ש O -ייצא O -כרך O -של O -ניירות O -. O -בקרב B-SENT -אקדמאים O - -ש O -אינם O -מוזמנים O -ל O -וועידות O -אלה O -שוררת O -ציניות O -רבה O -באשר O -ל O -מטר O -כרכי O -ה O -מאמרים O -ו O -ה O -ניירות O -ה O -מונפקים O -ב O - -חסות O -קרנות O -. O -אין B-SENT -זה O -סביר O -ש O -על O -שולחנות O -עיתונאים O -ו O -מעצבי O -מדיניות O -רובצים O -כל O -כרכי O -" O -אמריקן O -אקונומיק O -ריוויו O -" O - -של O -שלוש O -ה O -שנים O -ה O -אחרונות O -, O -אך O -בהחלט O -הגיוני O -ש O -יש O -ל O -הם O -כמה O -כרכים O -ה O -מספקים O -מידע O -ב O -נושא O - -נתון O -ב O -מינונים O -קטנים O -ו O -חזקים O -. O -עצה B-SENT -מספר O -6 O -. O -ערוך B-SENT -בדיקה O -מחודשת O -, O -נוקבת O -ו O -קטלנית O -, O -ל O -דוגמות O - -ליברליות O -עייפות O -. O -זה B-SENT -כ O -20 O -שנה O -מתאמצת O -חבורת O -שמרנים O -, O -ביני O -הם O -אירווינג O -קריסטול O -, O -להשיג O -סובסידיה O -פילנטרופית O -ל O -הוגים O - -רבים O -יותר O -מ O -ה O -ימין O -. O -הם B-SENT -סבורים O -ש O -יש O -אירוניה O -ב O -כך O -ש O -מענקי O -קרנות O -באים O -ב O -דרך O -כלל O -מ O - -טיפוסים O -שמרניים O -, O -אך O -ה O -קרן O -לובשת O -לעיתים O -קרובות O -חזות O -של O -שמאלה O -מ O -ה O -מרכז O -ב O -הנהגת O -ה O -פילנטרופואידים O -( O -ה O - -מונח O -ש O -טבע O -דווייט O -מקדונלד O -ב O -קרן O -פורד O -) O -, O -ש O -ב O -סופ O -ו O -של O -דבר O -מנהלים O -אות O -אות O -. O -קריסטול B-SENT - -ו O -חברי O -ו O -ה O -שמרנים O -עשו O -שני O -דברים O -: O -ראשית O -, O -הם O -עודדו O -חברות O -לתעל O -את O -ה O -פילנטרופיה O -של O -הן O -ל O - -יוצאים O -מן O -ה O -כלל O -של O -כלל O -זה O -, O -כמו O -קרן O -גון O -מ O -. O -שנית B-SENT -, O -הם O -עודדו O -את O -ה O -יוצאים O -מן O - -ה O -כלל O -להפעיל O -השפעה O -מודעת O -יותר O -על O -עיצוב O -מדיניות O -ב O -דרג O -ארצי O -. 
O -אנשים B-SENT -ב O -מרכז O -אותה O -תנועה O -מדגישים O -כי O -לא O - -היתה O -זו O -מזימה O -של O -ה O -ימין O -. O -" B-SENT -איש O -לא O -אמר O -הבה O -ניצור O -אווירה O -ש O -ב O -ה O -נוכל O -לבחור O -נשיא O -שמרן O - -" O -, O -זוכר O -לזלי O -לנקובסקי O -, O -אחד O -ה O -שותפים O -ל O -מזימה O -. O -לנקובסקי B-SENT -, O -ש O -ניהל O -את O -קרן O -סמית O -- O -ריצרדסון O - -ו O -כיום O -מנהל O -את O -מכון O -הדסון O -, O -אומר O -כי O -ה O -רעיון O -לא O -היה O -כל O -- O -כך O -לתת O -מענקים O -ל O -חשיבה O -ה O - -ימנית O -כש O -ל O -עצמה O -, O -אלא O -לתגמל O -חלופות O -ל O -כל O -ה O -חשיבה O -ה O -שמאלית O -ש O -קיבלה O -עד O -אז O -תגמולים O -ו O -מימון O - -. O -הוא B-SENT -ו O -קריסטול O -ו O -ה O -אחרים O -פשוט O -חשו O -ש O -" O -קיים O -מגוון O -שלם O -של O -נושאים O -ה O -ראויים O -ל O -בדיקה O -" O - -. O -הם B-SENT -פשוט O -תהו O -, O -נזכר O -לנקובסקי O -, O -" O -ל O -מי O -יש O -רעיונות O -מעניינים O -? O -" O -. O -ובכן B-SENT -, O -התברר O -ש O - -בעלי O -ה O -רעיונות O -ה O -מעניינים O -היו O -אנשים O -כמו O -רוברט O -בורק O -, O -מרטין O -פלדסטיין O -, O -צארלס O -מאריי O -ו O -אלן O -בלום O -ש O -גם O - -הודות O -ל O -תמיכה O -ה O -פילנטרופית O -ש O -קיבלו O -ב O -שנות O -ה O -70 O -ו O -או O -ב O -ראשית O -שנות O -ה O -80 O -אין O -צורך O -להציגם O - -את O -הם O -ב O -ציבור O -. O -אם B-SENT -אתה O -משמיע O -באוזני O -ה O -שמרנים O -רמז O -ל O -כך O -ש O -קריסטול O -ו O -שות O -הפכו O -את O -ה O - -קערה O -על O -פי O -ה O -, O -ו O -לכן O -כיום O -כסף O -ימני O -הוא O -ה O -חולש O -ב O -כיפה O -ב O -עולם O -ה O -רעיונות O -, O -הם O - -יפנו O -אות O -ך O -ל O -רשימת O -ה O -קרנות O -ה O -פילנטרופיות O -ה O -גדולות O -ביותר O -, O -ו O -יזכירו O -ל O -ך O -ש O -מבין O -ה O -קרנות O - -ה O -שמרניות O -אולין O -, O -סמית O -- O -ריצרדסון O -, O -לינד O -ו O -הרי O -בראדלו O -, O -פיו O -, O -שרה O -סקאיף O -רק O -פיו O -כלולה O -ב O - -רשימת O -ה O -גדולות O -. O -בהחלט B-SENT -נכון O -. O -ו B-SENT -לא O -זו O -בלבד O -, O -אלא O -נכון O -גם O -ש O -קרן O -מקארתור O -, O -ה O -דוגמה O - -ה O -שכיחה O -ביותר O -ל O -ליברליזם O -ה O -מבולבל O -של O -שנות O -ה O -60 O -, O -אכן O -ראויה O -ל O -תיאור O -זה O -. O -לפני B-SENT -כמה O -שנים O - -תרמה O -ה O -קרן O -82000 O -דולר O -ל O -אוניברסיטת O -קיימברידג O -ל O -ניסיונ O -ו O -של O -סטיוון O -הוקינג O -לאחד O -את O -תיאוריית O -ה O -קוונטים O -ו O -תיאוריית O - -ה O -יחסות O -של O -איינשטיין O -. O -כיום B-SENT -היתה O -תיאוריה O -כ O -זו O -ב O -חזקת O -מציאה O -גם O -ב O -מחיר O -כפול O -. O -אבל B-SENT -כמה O -כסף O - -העניקה O -קרן O -מקארתור O -ל O -פיסיקאים O -מבריקים O -ה O -מתעמקים O -ב O -בעיה O -זו O -ו O -אינם O -סובלים O -מ O -שיתוק O -טרגי O -בעקבות O -מחלה O -מנוונת O -חסרת O - -רחמים O -? O -עם B-SENT -זאת O -, O -גם O -ה O -קרנות O -בעלות O -ה O -שמות O -ה O -גדולים O -אינן O -בהכרח O -מעיינות O -ה O -חשיבה O -ה O -ליברלית O -ה O - -סטריאוטיפית O -כפי O -ש O -ל O -עתים O -קרובות O -מתארים O -אות O -ן O -. O -קרן B-SENT -רוקפלר O -אמנם O -מוכנה O -לחלק O -פה O -ו O -שם O -52000 O -דולר O -ל O - -לימוד O -" O -פמיניזם O -ו O -תיאוריה O -מוסרית O -: O -מבוא O -ל O -אתיקה O -ל O -חברות O -פוסט O -- O -תעשייתיות O -" O -. O -אבל B-SENT -אתה O -יכול O -לעיין O - -ב O -עשרות O -דפי O -דו"ח O -רוקפלר O -ה O -שנתי O -בלי O -למצוא O -פריט O -הוצאה O -ש O -קל O -כל O -כך O -ללעוג O -ל O -ו O -. 
O -אשר B-SENT -ל O - -קרן O -פורד O -שמרנים O -מתלוננים O -ש O -מדיניות O -העדפת O -שחורים O -כ O -קיזוז O -ל O -אפליית O -ה O -עבר O -( O -עצה O -מספר O -7 O -, O -היה O -שחור O - -) O -ש O -היא O -נוקטת O -, O -היא O -שריד O -ל O -ליברליזם O -קדום O -. O -ו B-SENT -אין O -ספק O -ש O -חלק O -נכבד O -מבין O -50 O -אלף O -ה O - -צקים O -ש O -עלי O -הם O -חותמת O -ה O -קרן O -נמסר O -ל O -קבוצות O -כמו O -" O -פראפראז O -" O -, O -כדי O -" O -לשרטט O -את O -מסלול O -ה O - -מורשת O -ה O -שחורה O -ב O -איסט O -סייד O -ה O -תחתית O -של O -ניו O -- O -יורק O -" O -. O -עם B-SENT -זאת O -, O -" O -טובת O -ה O -כלל O - -" O -, O -ה O -דו"ח O -ה O -גדול O -על O -סעד O -ש O -פרסמה O -קרן O -פורד O -ב O -9891 O -, O -סיפק O -תמיכה O -מנומקת O -( O -ב O -גיבוי O - -קונסנזוס O -רחב O -להפתיע O -של O -שמות O -מפורסמים O -) O -ל O -תוכנית O -עבודה O -ו O -סעד O -נוקשה O -ו O -יקרה O -למדי O -, O -ש O -אמנם O -הבטיחה O -מקומות O - -עבודה O -, O -אבל O -גם O -קבעה O -ש O -כל O -אדם O -ייזרק O -מ O -רשימות O -מקבלי O -ה O -סעד O -כעבור O -שנתיים O -. O -נוסף B-SENT -על O -כל O -זה O - -, O -אין O -זה O -משנה O -עד O -כמה O -גדול O -מספר O -ן O -של O -ה O -קרנות O -ב O -מרכז O -ו O -מעט O -שמאלה O -ממנ O -ממנ O -. O -כאשר B-SENT - -מגיעים O -ה O -דברים O -ל O -מדיניות O -ציבורית O -, O -כסף O -שמרן O -מקבל O -יותר O -תמורת O -ה O -דולר O -של O -ו O -. O -די B-SENT -לתת O -מבט O -ב O - -צוותות O -ה O -חשיבה O -, O -ה O -צינורות O -ה O -עיקריים O -בין O -ה O -קרנות O -לבין O -ה O -דיון O -ה O -ציבורי O -. O -קרנות B-SENT -שמרניות O -נמנעו O -ב O - -תבונה O -מ O -להגביל O -את O -ה O -תמיכה O -ה O -פיננסית O -של O -הן O -ל O -צוותות O -חשיבה O -שמרניים O -. O -בעוד B-SENT -ה O -פורדים O -ו O -ה O -רוקפלרים O - -הפכו O -ל O -מקורות O -פחות O -מהימנים O -של O -תמיכה O -ב O -מחקר O -, O -נשאו O -תמיד O -מוסדות O -כמו O -ברוקינגס O -את O -עיני O -הם O -ל O -קרנות O -שמרניות O - -כדי O -למלא O -את O -ה O -חסר O -. O -עצה B-SENT -מספר O -8 O -. O -הייה B-SENT -שבשבת O -רוח O -של O -רוח O -ה O -תקופה O -. O -כמו B-SENT -כל O -דבר O - -אחר O -, O -נושאים O -ה O -זוכים O -ל O -סבסוד O -נכבד O -באים O -ו O -הולכים O -. O -ב B-SENT -סוף O -שנות O -ה O -07 O -הרבו O -למשל O -לדבר O -ב O - -תחום O -ה O -יחסים O -ה O -בין O -- O -לאומיים O -על O -" O -ה O -דיאלוג O -בין O -צפון O -ו O -דרום O -" O -ו O -על O -" O -תלות O -הדדית O - -" O -, O -ו O -הרבה O -כסף O -הוקדש O -ל O -הגות O -על O -מדינות O -ה O -עולם O -ה O -שלישי O -. O -אז B-SENT -באו O -ה O -פלישה O -ל O -אפגניסטן O - -ו O -רונלד O -רייגן O -ו O -נאום O -אימפריית O -ה O -רשע O -. O -לפתע B-SENT -פתאום O -לא O -היה O -ה O -נושא O -ה O -חם O -ב O -יחסים O -ה O -בין O - -- O -לאומיים O -תלות O -הדדית O -גלובלית O -, O -אלא O -ביטחון O -לאומי O -. O -ה B-SENT -קרנות O -ה O -לא O -- O -שמרניות O -היו O -בעיקר O -אלה O -ש O -פרסמו O - -מחקרים O -ל O -אין O -- O -ספור O -ב O -שאלה O -כיצד O -לשמור O -על O -ה O -שלום O -. O -עכשיו B-SENT -, O -כש O -ה O -מלחמה O -ה O -קרה O -חלפה O - -, O -מנסים O -כולם O -למצוא O -משפטים O -ו O -מטבעות O -לשון O -חדשים O -ש O -ילכדו O -את O -תשומת O -לב O -ם O -של O -פקידי O -תוכניות O -. O -עצה B-SENT -מספר O - -9 O -. O -גייס B-SENT -ו O -הפקע O -ל O -רשותך O -טרמינולוגיה O -קיימת O -. O -נראה B-SENT -ש O -יעד O -ה O -משחק O -הוא O -לחטוף O -" O -ביטחון O -לאומי O -" O - -, O -נושא O -רב O -- O -שנתי O -עמיד O -, O -ו O -לנווט O -אות O -ו O -ל O -תחום O -ה O -מומחיות O -של O -ך O -. 
O -בעלי B-SENT -ה O -מודעות O - -ה O -סביבתית O -ה O -כלל O -- O -עולמית O -מדגישים O -ש O -כעת O -אי O -- O -אפשר O -כמעט O -להבדיל O -בין O -ביטחון O -לאומי O -לבין O -" O -ביטחון O -כדור O - -ה O -ארץ O -" O -. O -ה B-SENT -טיפוסים O -לובשי O -ה O -חליפות O -ה O -כהות O -, O -ה O -שייכים O -ל O -זרם O -מרכזי O -יותר O -, O -אומרים O -ש O - -שום O -מדינה O -אינה O -יכולה O -להיות O -בטוחה O -אם O -לא O -יהיה O -" O -מבנה O -ביטחון O -אירופי O -יציב O -" O -. O -ו B-SENT -כלכלנים O -ב O -חבורת O -מקדמי O - -ה O -מדיניות O -ה O -תעשייתית O -, O -יחד O -עם O -מחנכים O -ב O -חבורת O -ה O -חינוך O -ל O -מצוינות O -, O -ניסו O -ב O -עורמה O -לשים O -יד O -ם O - -על O -כספי O -מדיניות O -ה O -חוץ O -, O -ב O -טענה O -ש O -ב O -עתיד O -יוגדר O -ה O -ביטחון O -ה O -לאומי O -ב O -מונחים O -של O -כושר O -תחרות O - -כלכלי O -, O -לא O -של O -עוצמה O -צבאית O -. O -לגבי B-SENT -אנשים O -אלה O -סדאם O -חוסיין O -הוא O -מטרד O -. O -אבל B-SENT -סדאם O -חוסיין O -היה O -מתת O -- O - -שמים O -ל O -כל O -ה O -מומחים O -ל O -פיקוח O -רב O -- O -צדדי O -על O -ה O -נשק O -, O -ה O -טוענים O -זה O -שנים O -ש O -יש O -להפסיק O - -את O -הזרמת O -ו O -של O -נשק O -מתוחכם O -ל O -דיקטטורים O -רברבנים O -ב O -עולם O -ה O -שלישי O -. O -מדובר B-SENT -ב O -קומץ O -אנשים O -ש O -ניסו O -למשוך O - -אלי O -הם O -כספי O -מענקים O -ב O -עת O -ש O -ה O -עולם O -המטיר O -דולרים O -על O -אסטרטגיה O -גרעינית O -ו O -פיקוח O -רב O -- O -צדדי O -על O -ה O - -נשק O -. O -עובדה B-SENT -זו O -שימשה O -יסוד O -ל O -תלונה O -שכיחה O -ב O -תחום O -ה O -פילנטרופיה O -: O -התמקדות O -ם O -של O -קרנות O -ו O -צוותות O -חשיבה O - -ב O -מגמות O -אופנתיות O -גורמת O -ל O -כך O -ש O -ה O -חברה O -בקושי O -ערוכה O -ל O -גלים O -היסטוריים O -חדשים O -. O -גיימס B-SENT -אלן O -סמית O -, O -מחבר O - -ה O -ספר O -" O -ה O -ברוקרים O -של O -ה O -רעיונות O -: O -צוותות O -חשיבה O -ו O -עליית O -ה O -של O -עלית O -מדיניות O -חדשה O -" O -, O -אומר O - -כי O -ה O -התמקדות O -ב O -בעיות O -מדיניות O -דוחקות O -, O -ו O -דעיכת O -ה O -אמונה O -ב O -ערכ O -ו O -של O -מחקר O -בסיסי O -ב O -מדעי O -ה O - -חברה O -, O -הותירו O -אות O -נו O -ללא O -מאגר O -נרחב O -של O -מומחים O -, O -ש O -שממנ O -שממנ O -ניתן O -לשאוב O -כאשר O -יתעורר O -ה O -צורך O -. O - -זה B-SENT -נכון O -. O -אחרי B-SENT -הכל O -, O -כמה O -מחקרים O -ב O -חסות O -קרנות O -הוקדשו O -ב O -תקופת O -ה O -אובססיה O -ה O -ממושכת O -ל O -מלחמה O -ה O - -קרה O -ל O -ניסיון O -לגלות O -ב O -איזו O -מידה O -אדם O -כמו O -סדאם O -חוסיין O -עלול O -להיות O -מסוכן O -? O -כמה B-SENT -מחקרים O -שאלו O -כיצד O -להפוך O -כלכלה O - -סטליניסטית O -ל O -כלכלת O -שוק O -חופשי O -? O -יש B-SENT -סימוכין O -ל O -דעה O -ש O -כיום O -אלו O -הן O -שתי O -ה O -בעיות O -ה O -דוחקות O -ביותר O -של O - -ה O -עולם O -. O -אך B-SENT -אפשר O -לומר O -ב O -ביטחון O -, O -ש O -אף O -לא O -דולר O -אחד O -מבין O -למעלה O -מ O -05 O -מיליארד O -ה O -דולרים O - -ש O -הוציאו O -קרנות O -בין O -9791 O -ל O -9891 O -לא O -הוקדש O -ל O -הבהרת O -ן O -. O -האם B-SENT -ה O -כסף O -דוחף O -את O -ה O -רעיונות O -או O - -שמא O -זוכים O -ה O -רעיונות O -ל O -תמיכה O -כספית O -בזכות O -ערכ O -ם O -ו O -ה O -חידוש O -ש O -ב O -הם O -? O -כדי B-SENT -לענות O -על O -שאלות O - -אלו O -עלי O -ך O -להשתקע O -קודם O -ל O -כל O -ב O -אי O -- O -אלו O -הרהורים O -קוסמיים O -על O -טבע O -ה O -של O -ה O -היסטוריה O -, O -ו O - -להקדיש O -מחשבה O -ל O -דיאלקטיקה O -הגליאנית O -, O -ל O -תיאוריות O -על O -מקומ O -ו O -של O -ה O -אדם O -ה O -גדול O -ב O -ה O -היסטוריה O -ו O -כך O - -הלאה O -. 
O -ו B-SENT -זה O -אינו O -מבצע O -מ O -ה O -סוג O -ש O -יש O -לגשת O -אלי O -ו O -ללא O -מימון O -נכבד O -. O -ה B-SENT -יכולת O -לחזות O - -תוצאות O -בחירות O -ב O -ארצות O -ה O -ברית O -באמצעות O -סקרי O -דעת O -קהל O -עשויה O -להצטמצם O -ב O -שנים O -ה O -באות O -עד O -ל O -מינימום O -. O -כישלונות B-SENT - -חיזוי O -כבר O -נראו O -ב O -כמה O -מ O -מירוצי O -ה O -בחירות O -של O -ה O -שבוע O -. O -ה B-SENT -אשמה O -אינה O -ב O -שיטה O -או O -ב O -מדגם O - -, O -אלא O -ב O -שתי O -עובדות O -: O -( O -1 O -) O -אמריקאים O -אינם O -מתעניינים O -ביותר O -ב O -ה O -תהליך O -ה O -אלקטורלי O -, O -ו O -גם O - -אלה O -ה O -יודעים O -בעד O -מי O -היו O -רוצים O -להצביע O -אינם O -יודעים O -עד O -ה O -רגע O -ה O -אחרון O -אם O -יטרחו O -להצביע O -. O -רק B-SENT -35 O - -% O -מ O -הם O -הצביעו O -ב O -יום O -ג O -ב O -בחירות O -ל O -קונגרס O -ו O -ל O -מושלי O -ה O -מדינות O -. O -( B-SENT -2 O -) O -תחנת O - -ה O -קלפי O -שוב O -אינה O -מקום O -ה O -הצבעה O -ה O -הכרחי O -ה O -יחיד O -. O -ב B-SENT -קליפורניה O -הצביעו O -השנה O -לא O -פחות O -מ O -20 O -% O - -מ O -בעלי O -זכות O -ה O -בחירה O -ב O -דואר O -, O -לא O -מפני O -ש O -היו O -ב O -ארץ O -אחרת O -, O -או O -ב O -יבשת O -אחרת O -, O - -אלא O -מפני O -ש O -העדיפו O -את O -ה O -נוחות O -ה O -כרוכה O -ב O -מילוי O -טופסי O -ה O -הצבעה O -בין O -כותלי O -בית O -ם O -. O -ייתכן B-SENT -מאוד O - -ש O -ב O -הצבעה O -ב O -קלפי O -היתה O -יד O -ה O -של O -ה O -מועמדת O -ה O -דמוקרטית O -ל O -מושל O -, O -דיאן O -פיינסטיין O -, O -על O -ה O - -עליונה O -. O -כנראה B-SENT -רק O -בזכות O -ה O -הצבעה O -ב O -דואר O -הצליח O -לבסוף O -ה O -סנאטור O -פיט O -ווילסון O -לנצח O -, O -ב O -הפרש O -קטן O -. O - -משימת B-SENT -ם O -של O -עורכי O -סקרים O -ב O -שנים O -ה O -באות O -תהיה O -לברר O -תחילה O -האם O -ה O -נשאל O -מתכוון O -להצביע O -, O -ו O -באם O -אין O - -הוא O -מתכוון O -להצביע O -ה O -ייתכן O -ש O -ב O -כל O -זאת O -יצביע O -באמצעות O -ה O -דואר O -. O -ל B-SENT -ד"ר O -גון O -סילר O -, O -נשיא O -אוניברסיטת O - -בוסטון O -ש O -התמודד O -על O -כהונת O -מושל O -מסצוסטס O -, O -היתה O -תיאוריה O -שלמה O -על O -אי O -- O -ה O -התאמה O -של O -סקרי O -דעת O -קהל O -ל O - -מציאות O -ה O -אלקטורלית O -. O -הוא B-SENT -היה O -קורבן O -של O -אי O -- O -התאמה O -כ O -זאת O -. O -ה B-SENT -סקרים O -חזו O -ל O -ו O -תבוסה O -ניצחת O - -ב O -בחירות O -ה O -מוקדמות O -של O -ה O -מפלגה O -ה O -דמוקרטית O -, O -ב O -חודש O -ספטמבר O -. O -הוא B-SENT -הדהים O -את O -מפלגת O -ו O -, O -את O - -מדינת O -ו O -ו O -את O -ארה"ב O -ב O -ניצחון O -. O -ב B-SENT -מהלך O -מערכת O -ה O -בחירות O -ה O -כללית O -העלה O -סילבר O -את O -ה O -הסבר O -ה O - -בא O -ל O -אי O -- O -ה O -התאמות O -. O -" B-SENT -אמריקאים O -" O -, O -אמר O -, O -" O -מתרעמים O -על O -עצם O -ה O -שאלה O -איך O -תצביע O - -. O -סוף B-SENT -סוף O -, O -כל O -עניין O -ה O -הצבעה O -ה O -חופשית O -הוא O -פונקציה O -של O -סודיות O -ה O -. O -כאשר B-SENT -מטלפן O -אלי O -הם O -מישהו O - -, O -ש O -אין O -הם O -מכירים O -, O -ו O -שואל O -, O -איך O -תצביעו O -, O -אין O -הם O -בטוחים O -לעולם O -מיהו O -, O -ו O -מדוע O -הוא O - -שואל O -. O -אולי B-SENT -הוא O -ה O -שכן O -? O -אולי B-SENT -הוא O -ידיד O -, O -ה O -מנסה O -להכשילם O -להכשיל O -להכשיל O -? O -אולי B-SENT -הוא O -ה O -בוס O - -, O -ה O -מתכוון O -לעשות O -טיהור O -פוליטי O -ב O -משרד O -? O -ו B-SENT -אז O -הם O -מתרגזים O -ו O -משקרים O -" O -. 
O -ב B-SENT -יום O -ש O -שמעתי O - -את O -ה O -תיאוריה O -ה O -זו O -מ O -פי O -סילבר O -פרסם O -ה O -עיתון O -" O -בוסטון O -גלוב O -" O -סקר O -, O -ש O -הראה O -לראשונה O -יתרון O - -ל O -סילבר O -על O -פני O -יריב O -ו O -ה O -רפובליקאי O -, O -ויליאם O -וולד O -. O -כעבור B-SENT -שבוע O -התרחב O -ה O -יתרון O -עד O -9 O -% O -, O - -ו O -ב O -מסצוסטס O -היו O -אנשים O -משוכנעים O -ב O -ניצחונ O -ו O -של O -סילבר O -. O -הוא B-SENT -נוצח O -השבוע O -ב O -הפרש O -של O -שני O -אחוזים O -. O - -סילבר B-SENT -היה O -אולי O -ה O -דמות O -ה O -פוליטית O -ה O -מרתקת O -, O -ה O -מורכבת O -ו O -ה O -מסוכנת O -ביותר O -ב O -מערכת O -ה O -בחירות O -של O - -0991 O -. O -הוא B-SENT -אינטלקטואל O -רודני O -, O -ש O -התייחס O -ב O -בוז O -מתנשא O -ל O -כל O -שאלה O -ש O -נשאל O -, O -ו O -ל O -כל O -הערה O - -של O -יריב O -פוליטי O -. O -הוא B-SENT -השמיע O -הכרזות O -שערורייתיות O -ב O -גנות O -מיעוטים O -אתניים O -ו O -נגד O -קבוצות O -אוכלוסייה O -חלשות O -. O -הוא B-SENT -גידף O -ב O - -רבים O -את O -יריב O -ו O -( O -" O -בן O -כלבה O -, O -תוקע O -סכין O -ב O -גב O -" O -, O -אמר O -עלי O -ו O -ב O -ראיון O -עיתונאי O - -) O -. O -הוא B-SENT -צווח O -על O -עיתונאים O -מפורסמים O -ב O -רשתות O -טלוויזיה O -ארציות O -. O -לעומת B-SENT -זאת O -הוא O -גם O -השמיע O -מסר O -פוליטי O -רב O -- O - -תוכן O -, O -ש O -ב O -ו O -הדגיש O -יותר O -מ O -כל O -עניין O -אחר O -את O -משבר O -ה O -חינוך O -ב O -אמריקה O -, O -ו O -הציע O -חלופות O - -רציניות O -. O -הוא B-SENT -שכנע O -הרבה O -מ O -שומעי O -ו O -ש O -יש O -ל O -הוא O -ה O -מיומנות O -ה O -נחוצה O -להיות O -מושל O -. O -אבל B-SENT -מזג O - -ו O -ו O -הליכות O -יו O -הטילו O -פחד O -על O -סביבות O -יו O -. O -אם B-SENT -על O -פרנקלין O -דלאנו O -רוזוולט O -אמרו O -ש O -יש O -ל O -ו O -" O - -מוח O -מ O -מדרגה O -שנייה O -ו O -טמפרמנט O -מ O -מדרגה O -ראשונה O -" O -, O -הנה O -על O -סילבר O -היה O -אפשר O -לומר O -בדיוק O -את O -ה O -היפך O - -. O -ב B-SENT -32 O -ב O -אוקטובר O -התפעלה O -ממנ O -ממנ O -בעלת O -טור O -ב O -עיתון O -" O -בוסטון O -גלוב O -" O -ב O -מלים O -ה O -יאות O -ל O - -מעריצה O -בת O -21 O -: O -" O -הוא O -עשה O -ב O -חודשים O -אחדים O -למען O -צחות O -ה O -דיבור O -מה O -ש O -לקח O -ל O -חברה O -שנים O -כדי O - -לעשות O -למען O -טלוויזיה O -צבעונית O -... O -אם O -דיבור O -היה O -ספורט O -אולימפי O -, O -הוא O -היה O -זוכה O -ב O -מדליית O -ה O -זהב O -... O -סילבר O -כה O - -טוב O -, O -עד O -ש O -הוא O -גורם O -ל O -אנגלית O -להישמע O -כמו O -צרפתית O -... O -אם O -ייבחר O -, O -תהיה O -ל O -כולנו O -הם O -ה O -הזדמנות O - -ללמוד O -ממנ O -ממנ O -להיות O -סטודנטים O -ב O -כיתת O -ו O -ה O -ענקית O -, O -ה O -נקראת O -מסצוסטס O -" O -. O -אבל B-SENT -בדיוק O -ה O -רעיון O -ה O - -זה O -ה O -צורך O -להיות O -ארבע O -שנים O -ארוכות O -סטודנטים O -ב O -כיתת O -ו O -של O -פרופסור O -יהיר O -שלח O -מספר O -מפתיע O -של O -אנשי O -שמאל O -ל O - -זרועות O -יו O -של O -ה O -רפובליקן O -ה O -שמרן O -ויליאם O -וולד O -. O -הוא B-SENT -לבבי O -, O -חייכן O -, O -מסוגל O -להתבדח O -על O -חשבון O -עצמו O -, O - -ו O -לוקח O -את O -ה O -חיים O -קצת O -פחות O -ב O -רצינות O -. O -אילו B-SENT -ניצח O -סילבר O -, O -יתכן O -מאוד O -ש O -היה O -מנסה O -בעוד O -שנתיים O - -, O -או O -בעוד O -שש O -שנים O -, O -להתמודד O -על O -ה O -נשיאות O -. O -אלי B-SENT -ויזל O -, O -פרופסור O -ב O -אוניברסיטת O -בוסטון O -, O -ש O -סילבר O - -התאמץ O -הרבה O -למען O -זכיית O -ו O -ב O -פרס O -נובל O -ל O -שלום O -, O -תמך O -בגלוי O -ב O -מועמדות O -ו O -ל O -משרת O -ה O -מושל O -. 
O - -הוא B-SENT -היה O -אפילו O -מסוגל O -לראות O -ו O -מכהן O -ב O -בית O -ה O -לבן O -. O -" B-SENT -מדוע O -לא O -? O -" O -, O -שאל O -. O -" B-SENT - -גם O -על O -רונלד O -רייגן O -איש O -לא O -היה O -מאמין O -" O -. O -ה B-SENT -השוואה O -ה O -זו O -, O -על O -כל O -תוצאות O -יה O -, O -תיחסך O - -מאת O -מאת O -. O -סילבר B-SENT -חזר O -השבוע O -ל O -אוניברסיטה O -, O -כועס O -ו O -מתוסכל O -יותר O -מ O -אי O -- O -פעם O -. O -ה B-SENT -מפלגות O -ה O - -פוליטיות O -ב O -ארה"ב O -מתחילות O -כבר O -את O -הכנותי O -הן O -לקראת O -ה O -סיבוב O -ה O -בא O -, O -ב O -2991 O -. O -אז B-SENT -ייבחר O -לא O -רק O - -ה O -קונגרס O -, O -אלא O -גם O -ה O -נשיא O -. O -רק B-SENT -ארבע O -פעמים O -ב O -521 O -ה O -שנה O -ה O -אחרונות O -לא O -הצליח O -נשיא O -לחזור O - -ו O -להיבחר O -, O -אם O -רצה O -. O -עד B-SENT -לפני O -חודשיים O -לא O -היתה O -סיבה O -טובה O -להניח O -ש O -גורג O -בוש O -יהיה O -ה O -מקרה O -ה O - -חמישי O -. O -אבל B-SENT -מ O -אז O -נכשל O -בוש O -ב O -טיפול O -ו O -ב O -פרשת O -ה O -תקציב O -, O -הסתבך O -ב O -התכתשויות O -- O -סרק O -עם O - -ה O -דמוקרטים O -, O -ו O -נכנע O -לבסוף O -לאורך O -כל O -ה O -דרך O -, O -שוררת O -בין O -דמוקרטים O -ה O -תחושה O -ש O -יש O -ל O -הם O -סיכוי O - -ממשי O -להוציא O -את O -בוש O -מן O -ה O -בית O -ה O -לבן O -בעוד O -שנתיים O -. O -יש B-SENT -לא O -מעט O -רפובליקאים O -ה O -נוטים O -להסכים O -את O -ם O - -. O -עכשיו B-SENT -נבחנים O -ה O -מועמדים O -ל O -1992 O -. O -ה B-SENT -בחירות O -של O -ה O -שבוע O -יכלו O -להעניק O -דחיפה O -נאה O -ל O -שניים O -מן O -ה O - -פוליטיקאים O -ה O -מוזכרים O -ביותר O -: O -ה O -מושל O -רב O -- O -ה O -כריזמה O -של O -מדינת O -ניו O -יורק O -, O -מריו O -קואומו O -, O -ש O -שמ O - -ו O -מתרוצץ O -ב O -הקשרים O -נשיאותיים O -מ O -אז O -4891 O -; O -ו O -ה O -סנאטור O -מ O -ניו O -גרסי O -, O -ה O -אינטלקטואל O -ו O -כוכב O -ה O - -כדורסל O -ה O -מהולל O -של O -שנות O -ה O -06 O -, O -ביל O -בראדלי O -. O -ניצחונות B-SENT -שניהם O -לא O -הועמדו O -אף O -דקה O -אחת O -ב O -ספק O -ב O - -מהלך O -מערכת O -ה O -בחירות O -. O -ה B-SENT -שאלה O -ה O -יחידה O -היתה O -ב O -איזה O -הפרש O -ירסקו O -יריבים O -רפובליקאיים O -בני O -בלי O -שם O -. O -בראדלי B-SENT - -ניצח O -לבסוף O -ב O -הפרש O -של O -שלושה O -אחוזים O -. O -ב B-SENT -מוצאי O -יום O -ה O -בחירות O -התבוננו O -רפובליקאים O -ב O -אי O -- O -אמון O -ב O -תוצאות O - -, O -ו O -אמרו O -ב O -אנחה O -עמוקה O -כי O -אילו O -היו O -משכילים O -לעמוד O -על O -גודל O -ה O -הזדמנות O -ב O -ניו O -גרסי O -, O -היו O -מתגייסים O - -ל O -עזרת O -ה O -מועמדת O -ה O -רפובליקאית O -, O -כריסטין O -טוד O -ויטמן O -. O -מאחר B-SENT -ש O -הכול O -ראו O -ב O -ה O -שיה O -מובלת O -ל O -טבח O - -, O -היא O -נשארה O -עם O -תקציב O -בחירות O -קטן O -ב O -90 O -% O -מ O -זה O -של O -בראדלי O -. O -מריו B-SENT -קואומו O -היה O -ה O -מועמד O -ה O - -טבעי O -ביותר O -ל O -נשיאות O -מ O -אז O -נשא O -נאום O -מלהיב O -באוזני O -ועידת O -ה O -מפלגה O -ה O -דמוקרטית O -, O -ב O -קיץ O -4891 O -. O -כשרון B-SENT - -ה O -דיבור O -של O -ו O -, O -ה O -עומק O -ה O -אינטלקטואלי O -, O -ה O -נעימות O -ו O -ה O -חמימות O -שכנעו O -אנשים O -רבים O -ב O -ארה"ב O -ש O - -הוא O -יהיה O -יום O -אחד O -ה O -תשובה O -ה O -דמוקרטית O -ל O -רונלד O -רייגן O -. O -ה B-SENT -עובדה O -ש O -נחל O -ניצחון O -כל O -כך O -לא O -מרשים O - -על O -יריבים O -כל O -כך O -לא O -מרשימים O -אינה O -אומרת O -בהכרח O -ש O -סיכוי O -ו O -להתמודד O -על O -ה O -בית O -ה O -לבן O -אבדו O -לנצח O -. O - -היא B-SENT -אומרת O -לעומת O -זאת O -, O -ש O -משהו O -מן O -ה O -קסם O -של O -ו O -אבד O -. 
O -על B-SENT -סנאטורים O -אומרים O -כי O -אין O -ל O -ך O - -אחד O -מ O -הם O -ש O -אינו O -רואה O -את O -עצמו O -, O -כך O -או O -אחרת O -, O -מועמד O -פוטנציאלי O -ל O -נשיאות O -. O -ב B-SENT -מאה O -ה O - -זו O -נבחרו O -שני O -סנאטורים O -ל O -נשיאים O -, O -ו O -שלושה O -נשיאים O -אחרים O -כיהנו O -ב O -סנאט O -ב O -זמן O -כלשהו O -של O -ה O -קריירה O -של O - -של O -. O -לעומת B-SENT -זאת O -הגיעו O -ל O -נשיאות O -חמישה O -מושלי O -מדינות O -. O -כהונת B-SENT -מושל O -ב O -מדינה O -גדולה O -היא O -קרש O -קפיצה O -אל O -מרכז O - -תשומת O -ה O -לב O -ה O -לאומית O -. O -ב B-SENT -ארץ O -ה O -מתעניינת O -קודם O -כל O -ב O -יכולת O -ה O -ביצוע O -, O -כהונת O -ה O -נשיא O -נתפסת O - -לעתים O -קרובות O -ב O -אותו O -אופן O -ש O -נתפסת O -כהונת O -מנכ"ל O -תעשיית O -ה O -מכוניות O -קרייזלר O -. O -ה B-SENT -בחירות O -ה O -אלה O -מתנהלות O -על O -מיומנות O - -, O -לא O -על O -אידיאולוגיה O -" O -, O -הכריז O -חגיגית O -מייקל O -דוקאקיס O -, O -כאשר O -נבחר O -ל O -מועמד O -ה O -דמוקרטי O -ל O -נשיאות O -. O -הצלחת B-SENT - -ה O -ה O -כלכלית O -של O -מדינת O -ו O -, O -מסצוסטס O -, O -היתה O -צריכה O -להיות O -ה O -אישור O -ל O -דרגת O -מיומנות O -ו O -. O -ב B-SENT -סופ O - -ו O -של O -דבר O -התברר O -ש O -ה O -הצלחה O -היתה O -כישלון O -, O -ו O -דוקאקיס O -הובס O -. O -אבל B-SENT -ה O -נוסחה O -בעינה O -עומדת O -. O -לכן B-SENT - -יעוררו O -מידה O -גדולה O -של O -סקרנות O -ה O -מושלים O -ה O -חדשים O -של O -קליפורניה O -, O -של O -טקסס O -, O -של O -פלורידה O -, O -של O -אילינוי O -ו O - -של O -מישיגן O -. O -מ B-SENT -כל O -ה O -מושלים O -ה O -אלה O -בולטת O -אן O -ריצארדס O -. O -ב B-SENT -גיל O -85 O -היא O -רשאית O -לטעון O -ל O -תואר O - -ה O -פוליטיקאית O -ה O -בכירה O -ביותר O -ב O -ארה"ב O -. O -היא B-SENT -נבחרה O -השבוע O -ל O -מושלת O -טקסס O -, O -מ O -קץ O -מערכת O -ה O -בחירות O -ה O - -מכוערת O -ו O -ה O -דרמטית O -ביותר O -של O -0991 O -. O -ריצארדס B-SENT -הגיעה O -לראשונה O -אל O -ה O -תודעה O -ה O -לאומית O -ב O -ארה"ב O -, O -כאשר O -הוזמנה O - -לשאת O -את O -ה O -נאום O -ה O -מרכזי O -ב O -וועידת O -ה O -מפלגה O -ה O -דמוקרטית O -, O -ערב O -ה O -בחירות O -ל O -נשיאות O -של O -8891 O -. O - -ה B-SENT -דמוקרטים O -היו O -ב O -אווירת O -אופוריה O -, O -ו O -הניחו O -ש O -מועמד O -ם O -דוקאקיס O -לא O -יצטרך O -אפילו O -לרוץ O -ל O -בית O -ה O -לבן O - -. O -הוא B-SENT -יוכל O -ללכת O -ל O -שם O -ב O -נחת O -, O -בגלל O -ה O -בוז O -ו O -ה O -לעג O -ש O -עורר O -ה O -יריב O -ה O -רפובליקאי O - -גורג O -בוש O -. O -ריצארדס B-SENT -הצחיקה O -מיליוני O -אמריקאים O -ב O -נאום O -שזור O -עקיצות O -לגלגניות O -על O -בוש O -. O -" B-SENT -גורג O -ה O -מסכן O -" O -, O - -אמרה O -ריצארדס O -ב O -הגיה O -ה O -טקסאנית O -ה O -רכה O -של O -של O -, O -" O -הוא O -לא O -אשם O -, O -הוא O -נולד O -עם O -רגל O -כסף O - -ב O -פה O -" O -. O -זה B-SENT -היה O -רמז O -הן O -ל O -היותו O -בן O -טובים O -, O -ו O -הן O -ל O -יכולת O -ו O -יוצאת O -ה O -דופן O - -להגיד O -את O -ה O -דברים O -ה O -לא O -- O -נכונים O -ב O -זמן O -ה O -לא O -- O -נכון O -ו O -ב O -ה O -מקום O -ה O -לא O -- O - -נכון O -. O -בוש B-SENT -טען O -ל O -ימים O -כי O -ה O -התקפה O -ה O -היא O -נסכה O -ב O -ו O -את O -ה O -נחישות O -להשיב O -ל O -דמוקרטים O -ב O - -מתקפת O -נגד O -, O -ש O -זכר O -תוקפנות O -ה O -מעורר O -עד O -עכשיו O -אי O -- O -נוחות O -ניכרת O -בין O -משקיפים O -פוליטים O -. O -מאז B-SENT -היתה O -ריצארדס O - -דמות O -שנויה O -ב O -מחלוקת O -. 
O -כאשר B-SENT -החליטה O -להתמודד O -על O -מועמדות O -מפלגת O -ה O -ל O -מושל O -, O -התייצבו O -נגד O -נגד O -שני O -דמוקרטים O -אחרים O - -, O -ש O -הפיצו O -ידיעות O -כי O -היא O -השתמשה O -ב O -עבר O -ב O -סמים O -. O -ריצארדס B-SENT -לא O -טמנה O -את O -יד O -ה O -ב O -צלחת O -. O - -היא B-SENT -אמנם O -יצאה O -מן O -ה O -מערכה O -ה O -דמוקרטית O -ב O -ניצחון O -, O -אבל O -גם O -ב O -שן O -ו O -עין O -. O -איש B-SENT -ב O -טקסס O - -לא O -פיקפק O -ש O -יריב O -ה O -ה O -רפובליקאי O -, O -קלייטון O -ויליאמס O -, O -חוואי O -ו O -איש O -נפט O -, O -יביס O -אות O -היא O -ב O -קלות O - -. O -הוא B-SENT -הופיע O -ב O -תשדירי O -ה O -בחירות O -של O -ו O -רכוב O -על O -סוס O -, O -עם O -מגבעת O -רחבת O -תיתורת O -, O -ו O -פרט O -על O - -נימי O -ה O -מאציזמו O -ה O -טקסני O -. O -הוא B-SENT -גם O -התחיל O -למעוד O -מעידות O -מילוליות O -שערורייתיות O -. O -הוא B-SENT -יעץ O -ל O -נשים O -, O -ש O -אם O - -הן O -נופלות O -קרבן O -ל O -אונס O -ו O -הן O -חסרות O -אונים O -, O -" O -מוטב O -לשתוק O -ו O -ליהנות O -" O -. O -הוא B-SENT -דיבר O -ב O -פומבי O - -על O -ניסיונ O -ו O -כ O -איש O -צעיר O -ב O -בתי O -- O -בושת O -מקסיקאיים O -. O -הוא B-SENT -סירב O -ללחוץ O -את O -יד O -ה O -של O -ריצארדס O -ב O - -רבים O -, O -ו O -קרא O -ל O -ה O -ב O -פני O -ה O -שקרנית O -" O -. O -פעם B-SENT -אחרת O -שאל O -ב O -חיוך O -גדול O -, O -האם O -חזרה O - -להיות O -אלכוהוליסטית O -( O -ריצארדס O -הודתה O -ש O -עברה O -טיפול O -נגד O -אלכוהוליזם O -ב O -תקופה O -מסוימת O -של O -חיי O -ה O -) O -. O -יתרונ B-SENT -ו O -ה O - -עצום O -של O -ויליאמס O -ב O -סקרים O -פחת O -ו O -הלך O -, O -ו O -ה O -רפובליקאים O -מרטו O -ב O -ייאוש O -את O -שערותי O -הם O -. O -הם B-SENT -התחננו O - -לפני O -ויליאמס O -לשתוק O -. O -הוא B-SENT -לא O -הצליח O -. O -ב B-SENT -ינואר O -תקבל O -ל O -ידי O -ה O -ריצארדס O -את O -ה O -אחריות O -ה O -מינהלית O -ה O - -גדולה O -ביותר O -ש O -נמסרה O -אי O -- O -פעם O -לידי O -אשה O -ב O -תולדות O -ארה"ב O -. O -אשה B-SENT -אחרת O -, O -דיאן O -פיינסטיין O -, O -ראש O -ה O - -עיר O -לשעבר O -של O -סן O -פרנסיסקו O -, O -חצי O -- O -יהודיה O -, O -נוצחה O -ב O -הפרש O -קטן O -ב O -בחירות O -ל O -מושל O -קליפורניה O -, O -ה O - -גדולה O -ב O -מדינות O -ה O -ברית O -. O -אשה B-SENT -אחרת O -, O -קיי O -אור O -, O -איבדה O -את O -כהונת O -ה O -מושלת O -ב O -מדינת O -נבראסקה O -מ O - -קץ O -תקופת O -כהונה O -אחת O -. O -אשה B-SENT -שלישית O -, O -מדלן O -קיונין O -, O -יהודייה O -מלאה O -, O -סיימה O -שש O -שנות O -כהונה O -כ O -מושלת O -ורמונט O - -, O -ו O -לא O -העמידה O -את O -עצמה O -ל O -בחירה O -חוזרת O -. O -נשים B-SENT -הפתיעו O -, O -ו O -נבחרו O -ל O -מושלות O -ב O -מדינת O -קנזאס O -ו O - -ב O -מדינת O -אורגון O -. O -ב B-SENT -סך O -הכול O -יהיו O -איפוא O -ב O -ינואר O -ה O -בא O -שלוש O -מושלות O -ב O -ארה"ב O -ו O -שתי O -סנאטוריות O -. O - -אף B-SENT -אשה O -אחת O -לא O -הצליחה O -להיבחר O -השנה O -ל O -סנאט O -. O -לעומת B-SENT -זאת O -נשמרה O -רמת O -ה O -ייצוג O -ה O -יהודי O -. O -לפני B-SENT -ה O - -בחירות O -היו O -שבעה O -סנאטורים O -, O -ו O -גם O -אחרי O -הם O -יש O -שבעה O -סנאטורים O -. O -אחד B-SENT -מ O -הם O -, O -רודי O -בושוויץ O -, O -רפובליקן O - -מ O -מדינת O -מינסוטה O -, O -נוצח O -ב O -בחירות O -, O -אבל O -את O -מקומ O -ו O -תופס O -יהודי O -אחר O -מ O -מינסוטה O -, O -פול O -ולסטון O -. O - -יהדות B-SENT -ו O -של O -וולסטון O -נעשתה O -נושא O -מרכזי O -ב O -שלושת O -ה O -ימים O -ה O -אחרונים O -של O -מערכת O -ה O -בחירות O -. 
O -בושוויץ B-SENT -ניסה O -לשכנע O - -יהודים O -ב O -מינסוטה O -להצביע O -נגד O -וולסטון O -, O -באשר O -אין O -הוא O -יהודי O -נאמן O -, O -ו O -ילדי O -ו O -אינם O -גדלים O -כ O -יהודים O -. O - -מומחים B-SENT -פוליטיים O -ב O -מינסוטה O -העריכו O -ב O -יום O -ד O -כי O -ה O -התקפה O -על O -נאמנויות O -יו O -ה O -דתיות O -של O -וולסטון O -היתה O -גורם O -מרכזי O - -ב O -תבוסת O -בושוויץ O -. O -לפני B-SENT -ה O -התקפה O -הוביל O -בושוויץ O -ב O -יתרון O -של O -9 O -% O -. O -בתוך B-SENT -84 O -שעות O -התפוגג O -ה O -יתרון O - -. O -ב B-SENT -מוצאי O -ה O -בחירות O -, O -לפני O -ש O -הודה O -ב O -תבוסת O -ו O -, O -הופיע O -בושוויץ O -ב O -שידור O -טלוויזיה O -ו O -הטיל O -את O - -ה O -אשמה O -ב O -קשיי O -ו O -על O -שורה O -של O -גורמים O -חיצוניים O -. O -הוא B-SENT -לא O -חשב O -ש O -היה O -משהו O -פגום O -ב O -התנהגות O -ו O - -של O -של O -. O -וולסטון B-SENT -, O -פרופסור O -ל O -מדע O -ה O -מדינה O -ב O -אוניברסיטה O -מקומית O -, O -השכיל O -לנצל O -מצב O -רוח O -לאומי O -ב O -ארה"ב O - -נגד O -בעלי O -כהונות O -פוליטיות O -. O -תשדירי B-SENT -ה O -בחירות O -של O -ו O -תוארו O -כ O -שנונים O -ביותר O -ב O -ארה"ב O -. O -שניים B-SENT -מ O -הם O -היו O - -מערכונים O -קומיים O -. O -ב B-SENT -אחד O -נראה O -וולסטון O -מחפש O -את O -רודי O -בושוויץ O -על O -פני O -כל O -מינסוטה O -, O -ב O -ניסיון O -לקיים O -את O -ו O - -ויכוח O -פומבי O -. O -ב B-SENT -שני O -הוא O -נראה O -רץ O -ב O -קפיצות O -נוירוטיות O -על O -פני O -ציוני O -דרך O -ב O -חיי O -ו O -, O -ו O -מתנצל O - -ש O -אין O -ל O -ו O -זמן O -ללכת O -ב O -נחת O -מפני O -ש O -תקציב O -ה O -פרסום O -של O -ו O -עומד O -להיגמר O -. O -בושוויץ B-SENT -לא O -סבל O - -מ O -בעיות O -כ O -אלה O -. O -הוא B-SENT -מולטי O -- O -מיליונר O -, O -ה O -מסוגל O -תמיד O -להלוות O -ל O -עצמו O -כסף O -. O -חוץ B-SENT -מ O -זה O - -, O -מ O -כוח O -מעמד O -ו O -כ O -סנאטור O -הצליח O -לאסוף O -תרומות O -של O -7 O -מיליון O -דולר O -. O -וולסטון B-SENT -היה O -ה O -מועמד O -ה O -יחיד O - -ל O -סנאט O -השבוע O -ש O -הצליח O -להביס O -סנאטור O -מכהן O -. O -" B-SENT -יתרון O -ה O -כהונה O -" O -הוא O -עכשיו O -כל O -כך O -מוחלט O -, O -עד O - -ש O -ב O -כמה O -מירוצים O -לא O -טרחה O -ה O -מפלגה O -ה O -יריבה O -להציג O -אפילו O -מועמד O -נומינלי O -. O -סנאטורים B-SENT -ו O -צירי O -בית O -נבחרים O -מנצלים O - -את O -ה O -כהונה O -כדי O -לאסוף O -כמויות O -עצומות O -של O -כסף O -, O -ו O -להטביע O -ב O -הן O -כל O -יריב O -פוטנציאלי O -. O -תמוהה B-SENT -, O -ב O - -לשון O -המעטה O -, O -הערת O -ו O -של O -ה O -נשיא O -מיטראן O -לפני O -שבועות O -אחדים O -בלבד O -, O -ש O -ל O -דעת O -ו O -העמדת O -ו O -של O - -בוסקה O -ל O -דין O -תגרום O -נזק O -ציבורי O -. O -דמות B-SENT -מרכזית O -אחרת O -ב O -פרשה O -זו O -, O -דרקייה O -דה O -פלפואה O -, O -מי O -ש O -היה O - -ה O -נציב O -ה O -כללי O -ל O -שאלות O -ה O -יהודים O -, O -נמלט O -מ O -ארצ O -ו O -אחרי O -שחרור O -ה O -. O -שלא B-SENT -ב O -פני O -ו O - -נידון O -ל O -מוות O -, O -אך O -ה O -דבר O -לא O -הפריע O -ל O -ו O -לחיות O -ב O -גלות O -נינוחה O -ב O -ספרד O -. O -ב B-SENT -8791 O -הכריז O - -ב O -ראיון O -ענק O -ל O -שבועון O -" O -לאקספרס O -" O -, O -ש O -אמנם O -ב O -אושוויץ O -עסקו O -ה O -נאצים O -ב O -הגזה O -, O -אך O -לא O - -ב O -הגזת O -בני O -- O -אדם O -, O -אלא O -ב O -הגזת O -כינים O -. O -הוא B-SENT -מת O -לפני O -כמה O -שנים O -ב O -ספרד O -. O -הוא B-SENT -לא O - -הטריד O -את O -ה O -ממסד O -ה O -צרפתי O -ב O -הצהרות O -מביכות O -, O -ו O -שלטונות O -צרפת O -לא O -טרחו O -ש O -יוסגר O -ל O -ידי O -הם O -ל O - -צורך O -העמדת O -ו O -ל O -דין O -. 
O -שתי B-SENT -פרשיות O -אחרות O -, O -נוסף O -על O -זו O -של O -רנה O -בוסקה O -, O -נמצאות O -אף O -הן O -כיום O - -ב O -דיון O -משפטי O -מוקדם O -ש O -נמשך O -זמן O -רב O -: O -זו O -של O -מוריס O -פאפון O -ו O -זו O -של O -פול O -טובייה O -. O -פאפון B-SENT -, O - -ש O -כאמור O -שימש O -ממונה O -על O -מחוז O -זירונד O -ב O -זמן O -ה O -מלחמה O -, O -הורה O -על O -ביצוע O -ה O -אקציות O -נגד O -ה O -יהודים O -ב O - -יולי O -2491 O -. O -אחרי B-SENT -ה O -מלחמה O -, O -ב O -שנת O -8591 O -, O -הוא O -היה O -ממונה O -על O -מחוז O -פאריס O -. O -מאוחר B-SENT -יותר O -שימש O - -שר O -ב O -ממשלת O -ריימון O -באר O -ב O -ימי O -נשיאות O -ו O -של O -ואלרי O -זיסקאר O -- O -דאסטאן O -. O -ב B-SENT -3891 O -, O -בעקבות O -חקירה O -מחודשת O - -, O -נקבע O -כי O -יש O -מקום O -להגיש O -נגד O -ו O -כתב O -אישום O -בגין O -סיוע O -ל O -מעצר O -ם O -של O -יהודים O -כ O -דרישת O -ה O -גרמנים O - -. O -ב B-SENT -4891 O -הודיע O -ל O -ו O -שופט O -- O -חוקר O -כי O -הוגשו O -נגד O -ו O -תלונות O -נוספות O -. O -עד B-SENT -היום O -נמצאות O -תלונות O -הללו O - -ב O -שלב O -מוקדם O -של O -דיון O -בפני O -שופט O -- O -חוקר O -. O -בעקבות B-SENT -ה O -משפט O -נגד O -ראש O -ה O -גסטאפו O -ב O -לין O -, O -קלאוס O - -ברבי O -, O -עלה O -מחדש O -שמ O -ו O -של O -פול O -טביה O -. O -הוא B-SENT -היה O -מ O -מפקדי O -ה O -מיליציה O -ה O -צרפתית O -ה O -ידועה O -ל O - -שמצה O -ב O -ליון O -, O -נטל O -חלק O -פעיל O -ב O -אקציות O -של O -ה O -גסטאפו O -נגד O -לוחמי O -מחתרת O -ו O -יהודים O -, O -עינה O -עצורים O -, O - -שדד O -את O -רכוש O -ם O -, O -ו O -הורה O -על O -הוצאת O -ם O -ל O -הורג O -של O -כמה O -מ O -קורבנותי O -ו O -בלי O -משפט O -. O -אחרי B-SENT - -ה O -שחרור O -נעלמו O -עקבות O -יו O -. O -בית B-SENT -משפט O -ב O -ליון O -דן O -אות O -ו O -ל O -מוות O -שלא O -ב O -פני O -ו O -. O -ב B-SENT - -שנת O -1972 O -גילה O -תחקיר O -עיתונאי O -ש O -ב O -נובמבר O -1971 O -העניק O -ל O -ו O -ה O -נשיא O -זורז O -פומפידו O -חנינה O -, O -ו O -כי O -במשך O - -כל O -ה O -שנים O -נהנה O -ה O -פושע O -מ O -חסות O -ראשי O -ה O -כנסייה O -ה O -קתולית O -ב O -צרפת O -ו O -ב O -איטליה O -. O -ב B-SENT -שנת O - -3891 O -נתחדשה O -ה O -חקירה O -נגד O -טובייה O -, O -אך O -הוא O -שוב O -נעלם O -. O -ב B-SENT -ספטמבר O -1984 O -פורסמה O -ב O -עיתון O -אזורי O -מודעה O -על O - -מות O -ו O -כביכול O -, O -ו O -אף O -נמצא O -" O -מקום O -קבורת O -ו O -" O -. O -אך B-SENT -עדיין O -מחפשים O -אחרי O -טובייה O -ה O -חי O -. O - -אין B-SENT -פלא O -ש O -פרשת O -בוסקה O -הזכירה O -גם O -את O -פרשיות O -פאפון O -ו O -טובייה O -. O -ב B-SENT -מסיבת O -עיתונאים O -ש O -נערכה O -ב O -81 O -ב O - -אוקטובר O -השנה O -ב O -פאריס O -דיבר O -נשיא O -ה O -ליגה O -ל O -זכויות O -ה O -אדם O -, O -עו"ד O -איב O -זפה O -, O -על O -" O -חוסר O -ה O - -רצון O -ה O -פוליטי O -להביא O -לידי O -בירור O -את O -שלוש O -ה O -פרשיות O -של O -בוסקה O -, O -פאפון O -ו O -טובייה O -" O -. O -הוא B-SENT -אמר O -: O - -" O -שלושה O -תיקים O -אלה O -נמצאים O -ב O -נקודה O -מתה O -. O -אין B-SENT -זה O -מקרה O -ש O -שלושת O -ה O -אנשים O -הללו O -לא O -נשפטו O -עד O -כה O - -. O -אישים B-SENT -אלה O -מילאו O -תפקידים O -רמים O -ו O -בעלי O -חשיבות O -ב O -פוליטיקה O -, O -ב O -מינהל O -ו O -ב O -עולם O -ה O -עסקים O -. O -אני B-SENT - -סבור O -ש O -הם O -נהנים O -מ O -הזדהות O -מעמדית O -אמיתית O -. O -לא B-SENT -תובעים O -ל O -דין O -שר O -או O -ממונה O -על O -ה O -משטרה O -, O -משום O - -ש O -הוא O -שייך O -ל O -חוג O -של O -אנשים O -ש O -אינו O -רוצה O -ש O -יכבסו O -את O -ה O -כביסה O -ה O -מלוכלכת O -" O -. O -ו B-SENT -מה O - -בינתיים O -? 
O -פאפון B-SENT -, O -טובייה O -ו O -בוסקה O -חופשיים O -. O -כל B-SENT -אחד O -מה O -ם O -מתקרב O -ל O -גיל O -שמונים O -. O -ה B-SENT -רושם O -הוא O - -ש O -ה O -ממסד O -ה O -צרפתי O -מצפה O -ש O -ימותו O -ב O -שקט O -. O -מה B-SENT -ש O -ברור O -, O -ש O -הוא O -אינו O -מעוניין O -ב O -העמדת O - -ם O -ל O -דין O -. O -הוא B-SENT -חושש O -מ O -חשיפת O -סודות O -ה O -חבויים O -ב O -כספות O -, O -מפני O -עדויות O -חדשות O -, O -ו O -מפני O -זכרונות O - -ה O -עלולים O -לגלות O -שלדים O -ה O -חבויים O -שנים O -כה O -רבות O -ב O -ארונות O -שונים O -. O -ארבע B-SENT -ו O -חצי O -שעות O -צעדו O -אתמול O -ב O -ירושלים O - -עשרות O -אלפים O -אחרי O -ארונ O -ו O -של O -מאיר O -כהנא O -ב O -מסע O -הלווייה O -פרוע O -, O -רצוף O -קריאות O -הסתה O -ל O -פגיעות O -ב O -ערבים O -, O - -ב O -עיתונאים O -ו O -ב O -שוטרים O -. O -ב B-SENT -שעה O -1200 O -, O -כש O -החל O -ה O -קהל O -להתפזר O -, O -מנתה O -ה O -משטרה O -שלושה O -שוטרים O - -פצועים O -, O -אחד O -מ O -הם O -קשה O -, O -ו O -ארבעה O -ערבים O -עוברי O -- O -אורח O -ש O -נפצעו O -אף O -הם O -. O -ב B-SENT -שעה O -1400 O - -עדיין O -נמצא O -קהל O -ה O -רבבות O -תחת O -שליטה O -ו O -בקרה O -. O -החל B-SENT -מ O -שעות O -ה O -צהרים O -התקבצו O -אלפים O -סביב O -ישיבת O -ה O -רעיון O - -ה O -יהודי O -ב O -שכונת O -שמואל O -ה O -נביא O -. O -בני B-SENT -משפחת O -כהנא O -ישבו O -על O -מרפסת O -ה O -משקיפה O -על O -פתח O -ה O -ישיבה O -. O - -צלמי B-SENT -עיתונות O -רבים O -נדחקו O -על O -גג O -בטון O -סמוך O -. O -צוותי B-SENT -תקשורת O -אחרים O -הושמו O -מאחורי O -גדירות O -משטרה O -. O -ב B-SENT -שעה O -1500 O -הגיע O - -ה O -ארון O -ל O -מקום O -. O -מ B-SENT -ה O -רמקול O -בקע O -קול O -ו O -של O -אחד O -מ O -תלמידי O -ה O -ישיבה O -, O -ש O -קרא O -פסוקי O - -תהילים O -. O -ה B-SENT -ציבור O -חזר O -אחרי O -ו O -ב O -דבקות O -. O -קהל B-SENT -ה O -רבבות O -כלל O -כיפות O -סרוגות O -, O -חרדים O -, O -יהודים O -דתיים O - -מ O -ארה"ב O -ו O -אנשי O -שכונות O -. O -אנשי B-SENT -ה O -חברה O -קדישא O -ביקשו O -להעביר O -את O -אלונקת O -ה O -נפטר O -מ O -ה O -רכב O -אל O -ה O - -ישיבה O -. O -למעלה B-SENT -מ O -חצי O -שעה O -הם O -ניסו O -לעשות O -זאת O -, O -אך O -ללא O -הצלחה O -. O -בני B-SENT -ה O -משפחה O -, O -ה O -רבנים O - -, O -ראשי O -כך O -ו O -אפילו O -ה O -רב O -ה O -ראשי O -ל O -ישראל O -, O -ה O -ראשון O -ל O -ציון O -ה O -רב O -מרדכי O -אליהו O -, O - -התחננו O -ב O -פני O -ה O -קהל O -לפנות O -מעט O -דרך O -, O -אך O -לשווא O -. O -מאות B-SENT -צרו O -על O -רכב O -ה O -חברה O -קדישא O -, O -תוך O - -ש O -הם O -דוחפים O -איש O -את O -רעהו O -ו O -משלחים O -קללות O -ו O -גידופים O -לעבר O -אנשי O -ה O -תקשורת O -ה O -רבים O -. O -ה B-SENT -שר O -יובל O - -נאמן O -ו O -סגנית O -ו O -גאולה O -כהן O -, O -ש O -ביקשו O -להספיד O -את O -ה O -רב O -כהנא O -, O -גורשו O -מ O -ה O -מקום O -. O -ב B-SENT - -קהל O -נראו O -חבר O -ה O -כנסת O -אליקים O -העצני O -, O -ה O -שר O -יצחק O -פרץ O -, O -ה O -ח"כ O -לשעבר O -מאיר O -כהן O -אבידוב O -, O -ד"ר O - -ישראל O -אלדד O -ו O -דמויות O -רבות O -נוספות O -מ O -חוגי O -ה O -ימין O -. O -ב B-SENT -2115 O -ניסה O -רכב O -ה O -חברה O -קדישא O -לדחוף O -ב O -עדינות O - -את O -ה O -צרים O -עלי O -ו O -. O -ה B-SENT -תוצאה O -: O -פצוע O -ש O -נדרס O -. O -ל B-SENT -קול O -צפירות O -ה O -אמבולנס O -ש O -מיהר O -עמ O - -ו O -ל O -בית O -ה O -חולים O -, O -פתח O -ה O -רב O -ה O -ראשי O -את O -שורת O -ה O -הספדים O -. 
O -ה B-SENT -רב O -ה O -ראשי O -עמד O - -על O -תכונות O -יו O -של O -כהנא O -: O -" O -ה O -חסד O -של O -של O -, O -ה O -צדקה O -של O -הוא O -ו O -ראיית O -ה O -חיים O -של O - -של O -" O -, O -ו O -אחר O -כך O -פונה O -ל O -קהל O -: O -" O -ה O -קב"ה O -יקום O -דמ O -ו O -. O -לא B-SENT -להפריע O -ל O -כוחות O - -ה O -ביטחון O -. O -להשאיר B-SENT -את O -ה O -נקמה O -ל O -אלוקים O -" O -. O -ה B-SENT -רב O -נחמן O -כהנא O -, O -אחי O -ו O -של O -מאיר O -כהנא O - -, O -הזכיר O -ל O -ציבור O -תוך O -בכי O -, O -כי O -" O -דמ O -ו O -של O -כהנא O -נשפך O -על O -רקע O -הר O -- O -ה O -בית O -. O - -ה B-SENT -דם O -ה O -זה O -רותח O -. O -על B-SENT -ה O -דם O -ה O -זה O -אין O -כפרה O -" O -. O -רק B-SENT -ב O -דברי O -ו O -של O -ה O - -רב O -אברהם O -טולדאנו O -, O -משגיח O -ב O -ישיבת O -ה O -רעיון O -ה O -יהודי O -ו O -מספר O -4 O -ב O -רשימת O -כך O -ל O -כנסת O -, O -היו O - -כבר O -הוראות O -מעשיות O -: O -" O -אלוקים O -ייקום O -דמ O -ו O -ו O -אנו O -ניקום O -אות O -ו O -. O -אל B-SENT -נקמות O -ה O -. O -עכשיו B-SENT -עת O - -להרוג O -. O -זו B-SENT -ה O -עת O -להרוג O -רבות O -י O -. O -נצפה B-SENT -ל O -שלום O -, O -אך O -אינך O -יכול O -להגיע O -ל O -שלום O -, O -שלא O - -דרך O -ה O -מלחמה O -. O -עת B-SENT -ל O -טוב O -ו O -עת O -ל O -רע O -: O -אינך O -יכול O -להגיע O -ל O -טוב O -, O -אם O -אתה O -לא O - -מבער O -את O -ה O -רע O -. O -זה B-SENT -ה O -רגע O -. O -זה B-SENT -ה O -זמן O -לנקום O -נקמת O -ו O -של O -כל O -יהודי O -" O -. O -דברי B-SENT - -ו O -של O -יקותיאל O -בן O -- O -יעקב O -, O -איש O -כך O -ה O -מקורב O -ביותר O -ל O -רב O -כהנא O -, O -הם O -ה O -מפורשים O -ביותר O -, O - -ו O -הוא O -פנה O -אל O -רב O -ו O -, O -כאילו O -עוד O -היה O -חי O -: O -" O -ה O -רב O -כהנא O -רצחו O -אות O -ך O -פעמיים O -. O - -כאן B-SENT -ב O -ארץ O -פסלו O -אות O -אתה O -ו O -ב O -ארה"ב O -רצחו O -אות O -אות O -. O -ה B-SENT -תקשורת O -ה O -עויינת O -ממשיכה O -לרצוח O -אות O -ך O - -פעם O -שלישית O -. O -נבכה B-SENT -ו O -נפסיק O -לבכות O -ו O -נקיים O -את O -ה O -תורה O -ש O -לימדת O -אות O -נו O -נקמה O -. O -ניתן B-SENT -את O -רשות O - -ה O -דיבור O -ל O -חבר O -תת O -- O -מקלע O -ל O -חבר O -סכין O -. O -שלום B-SENT -ה O -רב O -כהנא O -" O -. O -אט B-SENT -אט O -התחיל O -ה O - -המון O -לצעוד O -אל O -מ O -חוץ O -ל O -סמטאות O -לעבר O -ה O -כביש O -ה O -ראשי O -. O -תחילה B-SENT -ב O -שקט O -ו O -אחר O -כך O -תוך O -קריאות O - -גוברות O -ו O -הולכות O -: O -" O -מוות O -ל O -ערבים O -" O -, O -" O -מוות O -ל O -שמאלנים O -" O -, O -" O -רוצים O -נקמה O -" O -. O - -ה B-SENT -קהל O -ה O -זה O -היה O -צמא O -ל O -דם O -ו O -הוא O -חיפש O -ערבים O -. O -מדי B-SENT -פעם O -ב O -פעם O -פרצה O -קבוצה O -של O -כמה O - -מאות O -הצידה O -כאשר O -נדמה O -היה O -ש O -גילתה O -ערבי O -. O -אבל B-SENT -ה O -אכזבה O -רבה O -. O -בעלי B-SENT -מפעלים O -רבים O -על O -נתיב O -ה O -הלווייה O - -נעלו O -את O -פועלי O -הם O -מאחורי O -סוגר O -ו O -בריח O -. O -אחדים B-SENT -מ O -הם O -עשו O -שטות O -ו O -הציצו O -מבעד O -ל O -חלונות O -; O -בתוך O - -דקות O -נופצו O -שמשות O -. O -מאחור B-SENT -צרו O -מאות O -על O -בניין O -ה O -טלוויזיה O -. O -פרשי B-SENT -משטרה O -דחקו O -אות O -ם O -לאחור O -. O -שוטר B-SENT -נפגע O - -מ O -אבן O -ב O -ראש O -ו O -. O -קבוצה B-SENT -אחרת O -של O -מפגינים O -פרצה O -ל O -מרכז O -ה O -מסחרי O -סנטר O -1 O -, O -ליד O -ה O -תחנה O - -ה O -מרכזית O -. O -יושבי B-SENT -בתי O -ה O -קפה O -נבעתו O -. O -אם B-SENT -עם O -תינוקת O -ה O -פרצה O -ב O -זעקות O -. O -מיד B-SENT -אחר O -כך O -נופצו O - -חלונות O -ראווה O -של O -חנויות O -ב O -מרכז O -. 
O -ה B-SENT -שוטרים O -הגיעו O -רק O -אחרי O -כמה O -דקות O -. O -מול B-SENT -בית O -מספר O -3 O -ב O -רחוב O - -גבעת O -שאול O -אני O -עד O -ל O -אירוע O -ה O -חמור O -ביותר O -. O - diff --git a/hebpipe/lib/flair_pos_tagger.py b/hebpipe/lib/flair_pos_tagger.py deleted file mode 100644 index d5e59d8..0000000 --- a/hebpipe/lib/flair_pos_tagger.py +++ /dev/null @@ -1,420 +0,0 @@ -""" -flair_pos_tagger.py - -This module trains flair sequence labelers to predict POS and deprel for OTHER modules. -""" - - -from argparse import ArgumentParser -from flair.data import Corpus, Sentence -from flair.datasets import ColumnCorpus -from flair.embeddings import OneHotEmbeddings, TransformerWordEmbeddings, StackedEmbeddings -from flair.models import SequenceTagger -import os, sys, io -from glob import glob -from random import seed, shuffle -seed(42) - -script_dir = os.path.dirname(os.path.realpath(__file__)) + os.sep -model_dir = script_dir + ".." + os.sep + "models" + os.sep -IAHLT_ROOT = "IAHLT_HTB" + os.sep # Path to IAHLT HTB repo -TARGET_FEATS = {"Gender","Number","Tense","VerbForm","Voice","HebBinyan","Definite"} - -class FlairTagger: - - def __init__(self, train=False, morph=False): - if not train: - if morph: - self.model = SequenceTagger.load(model_dir + "heb.morph") - else: - self.model = SequenceTagger.load(model_dir + "heb.flair") - - @staticmethod - def make_seg_data(): - prefixes = {"ב","כ","מ","ל","ה",} - suffixes = {"ו","ה","י","ך","ם","ן","הם","הן","כם","כן","יו"} - def segs2tag(segs): - tag = "X" - if len(segs) == 2: - if segs[0] == "ו": - tag = "W" - elif segs[0] in ["ש","כש"]: - tag = "S" - elif segs[0] in prefixes: - tag = "B" - if segs[1] in suffixes: - tag += "Y" - elif len(segs) == 3: - if segs[0] == "ו": - tag = "W" - elif segs[0] in ["ש","כש"]: - tag = "S" - elif segs[0] in prefixes: - tag = "B" - if segs[1] in ["ש","כש"]: - tag += "S" - elif segs[1] in prefixes: - tag += "B" - if segs[2] in suffixes: - tag += "Y" - elif len(segs) > 3: - if segs[0] == "ו": - tag = "W" - elif segs[0] in ["ש","כש"]: - tag = "S" - if segs[1] in ["ש","כש"]: - tag += "S" - elif segs[1] in prefixes: - tag += "B" - if segs[2] in prefixes: - tag += "B" - if segs[-1] in suffixes: - tag += "Y" - if tag == "BS": - tag = "BB" # מ+ש, כ+ש - elif tag == "WSY": # ושעיקרה - tag = "WBY" - elif "XS" in tag: - tag = "X" - return tag - - def conllu2segs(conllu, target="affixes"): - super_length = 0 - limit = 4 # Maximum bound group length in units, discard sentences with longer groups - sents = [] - words = [] - labels = [] - word = [] - max_len = 0 - lines = conllu.split("\n") - for line in lines: - if "\t" in line: - fields = line.split("\t") - if "-" in fields[0]: - start, end = fields[0].split("-") - super_length = int(end) - int(start) + 1 - else: - if super_length > 0: - word.append(fields[1]) - super_length -= 1 - if super_length == 0: - words.append("".join(word)) - if target=="count": - labels.append(str(len(word))) - else: - labels.append(segs2tag(word)) - if len(word) > max_len: - max_len = len(word) - word = [] - else: - words.append(fields[1]) - labels.append("O") - elif len(line) == 0 and len(words) > 0: - if max_len > limit or " " in "".join(words): # Reject sentence - max_len = 0 - else: - sents.append("\n".join([w + "\t" + l for w, l, in zip(words,labels)])) - words = [] - labels = [] - return "\n\n".join(sents) - - files = glob(IAHLT_ROOT + "seg" + os.sep + "*.conllu") - data = "" - for file_ in files: - data += conllu2segs(io.open(file_,encoding="utf8").read()) + "\n\n" - sents = 
data.strip().split("\n\n") - sents = list(set(sents)) - shuffle(sents) - with io.open("tagger" + os.sep + "heb_train_seg.txt", 'w', encoding="utf8",newline="\n") as f: - f.write("\n\n".join(sents[:int(-len(sents)/10)])) - with io.open("tagger" + os.sep + "heb_dev_seg.txt", 'w', encoding="utf8",newline="\n") as f: - f.write("\n\n".join(sents[int(-len(sents)/10):])) - with io.open("tagger" + os.sep + "heb_test_seg.txt", 'w', encoding="utf8",newline="\n") as f: - f.write("\n\n".join(sents[int(-len(sents)/10):])) - - @staticmethod - def make_pos_data(tags=False): - def filter_morph(feats): - if feats == "_": - return "O" - else: - annos = [] - for f in feats.split("|"): - k, v = f.split("=") - if k in TARGET_FEATS: - annos.append(k+"="+v) - if len(annos) > 0: - return "|".join(annos) - else: - return "O" - - files = glob(IAHLT_ROOT + "*.conllu") - train = test = dev = "" - super_tok_len = 0 - super_tok_start = False - suff = "_morph" if tags else "" - for file_ in files: - output = [] - lines = io.open(file_,encoding="utf8").readlines() - for line in lines: - if "\t" in line: - fields = line.split("\t") - if "." in fields[0]: - continue - if "-" in fields[0]: - super_tok_start = True - start,end = fields[0].split("-") - super_tok_len = int(end)-int(start) + 1 - continue - if super_tok_start: - super_tok_position = "B" - super_tok_start = False - super_tok_len -= 1 - elif super_tok_len > 0: - super_tok_position = "I" - super_tok_len -= 1 - if super_tok_len == 0: - super_tok_position = "E" - else: - super_tok_position = "O" - if tags: - morph = filter_morph(fields[5]) - output.append(fields[1] + "\t" + super_tok_position + "\t" + fields[4] + "\t" + morph) - else: - output.append(fields[1] + "\t" + super_tok_position + "\t" + fields[4]) - elif len(line.strip()) == 0: - if output[-1] != "": - output.append("") - if "dev" in file_: - dev += "\n".join(output) - elif "test" in file_: - test += "\n".join(output) - else: - train += "\n".join(output) - with io.open("tagger" + os.sep + "heb_train"+suff+".txt", 'w', encoding="utf8",newline="\n") as f: - f.write(train) - with io.open("tagger" + os.sep + "heb_dev"+suff+".txt", 'w', encoding="utf8",newline="\n") as f: - f.write(dev) - with io.open("tagger" + os.sep + "heb_test"+suff+".txt", 'w', encoding="utf8",newline="\n") as f: - f.write(test) - - def train(self, cuda_safe=True, positional=True, tags=False, seg=False): - if cuda_safe: - # Prevent CUDA Launch Failure random error, but slower: - import torch - torch.backends.cudnn.enabled = False - # Or: - # os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - # 1. get the corpus - # this is the folder in which train, test and dev files reside - data_folder = "tagger" + os.sep - - # init a corpus using column format, data folder and the names of the train, dev and test files - - # define columns - columns = {0: "text", 1: "super", 2: "pos"} - suff = "" - if positional: - columns[1] = "super" - columns[2] = "pos" - if tags: - columns[3] = "morph" - suff = "_morph" - if seg: - columns[1] = "seg" - del columns[2] - self.make_seg_data() - suff = "_seg" - else: - self.make_pos_data(tags=tags) - - corpus: Corpus = ColumnCorpus( - data_folder, columns, - train_file="heb_train"+suff+".txt", - test_file="heb_test"+suff+".txt", - dev_file="heb_dev"+suff+".txt", - ) - - # 2. what tag do we want to predict? - tag_type = 'pos' if not tags else "morph" - if seg: - tag_type = "seg" - - # 3. make the tag dictionary from the corpus - tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type) - print(tag_dictionary) - - # 4. 
initialize embeddings - embeddings: TransformerWordEmbeddings = TransformerWordEmbeddings('onlplab/alephbert-base',) - if positional: - positions: OneHotEmbeddings = OneHotEmbeddings(corpus=corpus, field="super", embedding_length=5) - if tags: - tag_emb: OneHotEmbeddings = OneHotEmbeddings(corpus=corpus, field="pos", embedding_length=17) - stacked: StackedEmbeddings = StackedEmbeddings([embeddings,positions,tag_emb]) - else: - stacked: StackedEmbeddings = StackedEmbeddings([embeddings, positions]) - elif not seg: - if tags: - tag_emb: OneHotEmbeddings = OneHotEmbeddings(corpus=corpus, field="pos", embedding_length=17) - stacked: StackedEmbeddings = StackedEmbeddings([embeddings,tag_emb]) - else: - stacked = embeddings - else: - stacked = embeddings - - # 5. initialize sequence tagger - tagger: SequenceTagger = SequenceTagger(hidden_size=256, - embeddings=stacked, - tag_dictionary=tag_dictionary, - tag_type=tag_type, - use_crf=True, - use_rnn=True) - - # 6. initialize trainer - from flair.trainers import ModelTrainer - - trainer: ModelTrainer = ModelTrainer(tagger, corpus) - - # 7. start training - trainer.train(script_dir + "pos-dependencies" + os.sep + 'flair_tagger', - learning_rate=0.1, - mini_batch_size=15, - max_epochs=150) - - def predict(self, in_path=None, in_format="flair", out_format="conllu", as_text=False, tags=False, seg=False): - model = self.model - tagcol = 4 - - if as_text: - data = in_path - #data = (data + "\n").replace("\n", "").replace("\n", "\n").strip() - else: - data = io.open(in_path,encoding="utf8").read() - sents = [] - words = [] - positions = [] - true_tags = [] - true_pos = [] - super_tok_start = False - super_tok_len = 0 - data = data.strip() + "\n" # Ensure final new line for last sentence - for line in data.split("\n"): - if len(line.strip())==0: - if len(words) > 0: - sents.append(Sentence(" ".join(words),use_tokenizer=lambda x:x.split(" "))) - for i, word in enumerate(sents[-1]): - if not seg: - word.add_label("super",positions[i]) - if tags: - word.add_label("pos",true_pos[i]) - words = [] - positions = [] - true_pos = [] - else: - if in_format == "flair": - words.append(line.split("\t")[0]) - if not seg: - positions.append(line.split("\t")[1]) - if tags: - true_pos.append(line.split("\t")[2]) - true_tags.append(line.split("\t")[3]) if "\t" in line else true_tags.append("") - else: - true_tags.append(line.split("\t")[2]) if "\t" in line else true_tags.append("") - else: - if "\t" in line: - fields = line.split("\t") - if "." 
in fields[0]: - continue - if "-" in fields[0]: - super_tok_start = True - start, end = fields[0].split("-") - super_tok_len = int(end) - int(start) + 1 - continue - if super_tok_start: - super_tok_position = "B" - super_tok_start = False - super_tok_len -= 1 - elif super_tok_len > 0: - super_tok_position = "I" - super_tok_len -= 1 - if super_tok_len == 0: - super_tok_position = "E" - else: - super_tok_position = "O" - words.append(line.split("\t")[1]) - positions.append(super_tok_position) - true_tags.append(line.split("\t")[tagcol]) - true_pos.append(line.split("\t")[4]) - - # predict tags and print - model.predict(sents)#, all_tag_prob=True) - - preds = [] - scores = [] - words = [] - for i, sent in enumerate(sents): - for tok in sent.tokens: - if tags: - pred = tok.labels[2].value - score = str(tok.labels[2].score) - else: - pred = tok.labels[1].value - score = str(tok.labels[1].score) - preds.append(pred) - scores.append(score) - words.append(tok.text) - - toknum = 0 - output = [] - #out_format="diff" - for i, sent in enumerate(sents): - tid=1 - if i>0 and out_format=="conllu": - output.append("") - for tok in sent.tokens: - pred = preds[toknum] - score = str(scores[toknum]) - if len(score)>5: - score = score[:5] - if out_format == "conllu": - pred = pred if not pred == "O" else "_" - fields = [str(tid),tok.text,"_",pred,pred,"_","_","_","_","_"] - output.append("\t".join(fields)) - tid+=1 - elif out_format == "xg": - output.append("\t".join([pred, tok.text, score])) - else: - true_tag = true_tags[toknum] - corr = "T" if true_tag == pred else "F" - output.append("\t".join([pred, true_tag, corr, score, tok.text, true_pos[toknum]])) - toknum += 1 - - if as_text: - return "\n".join(output) - else: - ext = "xpos.conllu" if out_format == "conllu" else "txt" - partition = "test" if "test" in in_path else "dev" - with io.open(script_dir + "pos-dependencies" +os.sep + "flair-"+partition+"-pred." 
+ ext,'w',encoding="utf8",newline="\n") as f: - f.write("\n".join(output)) - - -if __name__ == "__main__": - p = ArgumentParser() - p.add_argument("-m","--mode",choices=["train","predict"],default="predict") - p.add_argument("-f","--file",default=None,help="Blank for training, blank predict for eval, or file to run predict on") - p.add_argument("-p","--positional_embeddings",action="store_true",help="Whether to use positional embeddings within supertokens (MWTs)") - p.add_argument("-t","--tag_embeddings",action="store_true",help="Whether to use POS tag embeddings for morphology prediction") - p.add_argument("-s","--seg",action="store_true",help="Whether to train segmentation instead of tagging") - p.add_argument("-i","--input_format",choices=["flair","conllu"],default="flair",help="flair two column training format or conllu") - p.add_argument("-o","--output_format",choices=["flair","conllu","xg"],default="conllu",help="flair two column training format or conllu") - - opts = p.parse_args() - - if opts.mode == "train": - tagger = FlairTagger(train=True) - tagger.train(positional=opts.positional_embeddings, tags=opts.tag_embeddings, seg=opts.seg) - else: - tagger = FlairTagger(train=False) - tagger.predict(in_format=opts.input_format, out_format=opts.output_format, - in_path=opts.file) diff --git a/hebpipe/lib/flair_sent_splitter.py b/hebpipe/lib/flair_sent_splitter.py deleted file mode 100644 index ea3d8c7..0000000 --- a/hebpipe/lib/flair_sent_splitter.py +++ /dev/null @@ -1,441 +0,0 @@ -from flair.data import Corpus, Sentence -from flair.datasets import ColumnCorpus -from flair.embeddings import TransformerWordEmbeddings -from flair.models import SequenceTagger -import flair - -import os, sys, re, io - -script_dir = os.path.dirname(os.path.realpath(__file__)) + os.sep -model_dir = script_dir + ".." + os.sep + "models" + os.sep - -import conllu -from collections import OrderedDict, defaultdict - -try: - from .reorder_sgml import reorder -except ImportError: - from reorder_sgml import reorder - -TAGS = [ - "sp", - "table", - "row", - "cell", - "head", - "p", - "figure", - "caption", - "list", - "item", - "quote", - "s", - "q", - "hi", - "sic", - "ref", - "date", - "incident", - "w", -] -# These XML tags force a sentence break in the data, you can add more here: -BLOCK_TAGS = ["sp", "head", "p", "figure", "caption", "list", "item"] -BLOCK_TAGS += ["❦❦❦"] # reserved tag for sentences in input based on newlines -OPEN_SGML_ELT = re.compile(r"^<([^/ ]+)( .*)?>$") -CLOSE_SGML_ELT = re.compile(r"^$") - - -def maximal_nontoken_span_end(sgml_list, i): - """Return j such that sgml_list[i:j] does not contain tokens - and no element that is begun in the MNS is closed in it.""" - opened = [] - j = i - while j < len(sgml_list): - line = sgml_list[j] - open_match = re.match(OPEN_SGML_ELT, line) - close_match = re.match(CLOSE_SGML_ELT, line) - if not (open_match or close_match): - break - if open_match: - opened.append(open_match.groups()[0]) - if close_match and close_match.groups()[0] in opened: - break - j += 1 - return j - - -def fix_malformed_sentences(sgml_list): - """ - Fixing malformed SGML seems to boil down to two cases: - - (1) The sentence is interrupted by the close of a tag that opened before it. In this case, - update the s boundaries so that we close and begin sentences at the close tag: - - - ... - ... - ... - ... ==> - - ... - ... - - - (2) Some tag opened inside of the sentence and has remained unclosed at the time of sentence closure. 
- In this case, we choose not to believe the sentence split, and merge the two sentences: - - - ... - ... - ... - ==> ... - ... - ... - ... - ... - - """ - tag_opened = defaultdict(list) - i = 0 - while i < len(sgml_list): - line = sgml_list[i].strip() - open_match = re.search(OPEN_SGML_ELT, line) - close_match = re.search(CLOSE_SGML_ELT, line) - if open_match: - tag_opened[open_match.groups()[0]].append(i) - elif close_match: - tagname = close_match.groups()[0] - j = maximal_nontoken_span_end(sgml_list, i + 1) - mns = sgml_list[i:j] - - # case 1: we've encountered a non-s closing tag. If... - if ( - tagname != "s" # the closing tag is not an s - and len(tag_opened["s"]) > 0 # and we're in a sentence - and len(tag_opened[tagname]) > 0 - and len(tag_opened["s"]) > 0 # and the sentence opened after the tag - and tag_opened[tagname][-1] < tag_opened["s"][-1] - and "" not in mns # the sentence is not closed in the mns - ): - # end sentence here and move i back to the line we were looking at - sgml_list.insert(i, "") - i += 1 - # open a new sentence at the end of the mns and note that we are no longer in the sentence - sgml_list.insert(j + 1, "") - tag_opened["s"].pop(-1) - # we have successfully closed this tag - tag_opened[tagname].pop(-1) - # case 2: s closing tag and there's some tag that opened inside of it that isn't closed in time - elif tagname == "s" and any( - e != "s" and f"" not in mns - for e in [ - e - for e in tag_opened.keys() - if len(tag_opened[e]) > 0 and len(tag_opened["s"]) > 0 and tag_opened[e][-1] > tag_opened["s"][-1] - ] - ): - # some non-s element opened within this sentence and has not been closed even in the mns - assert "" in mns - sgml_list.pop(i) - i -= 1 - sgml_list.pop(i + mns.index("")) - else: - tag_opened[tagname].pop(-1) - i += 1 - return sgml_list - - -def is_sgml_tag(line): - return line.startswith("<") and line.endswith(">") - - -def unescape(token): - token = token.replace(""", '"') - token = token.replace("<", "<") - token = token.replace(">", ">") - token = token.replace("&", "&") - token = token.replace("'", "'") - return token - - -def tokens2conllu(tokens): - tokens = [ - OrderedDict( - (k, v) - for k, v in zip( - conllu.parser.DEFAULT_FIELDS, - [i + 1, unescape(token)] + ["_" for i in range(len(conllu.parser.DEFAULT_FIELDS) - 1)], - ) - ) - for i, token in enumerate(tokens) - ] - tl = conllu.TokenList(tokens) - return tl - - -class FlairSentSplitter: - - def __init__(self, model_path=None, span_size=20, stride_size=10): - - self.span_size = span_size # Each shingle is 20 tokens by default - self.stride_size = stride_size # Tag a shingle every stride_size tokens - self.test_dependencies() - if model_path is not None: - self.load_model(model_path) - else: - self.model = None - - def load_model(self, path=None): - if path is None: - path = model_dir + "heb.sent" - if not os.path.exists(path): - raise FileNotFoundError("Cannot find sentence splitter model heb.sent at " +path) - self.model = SequenceTagger.load(path) - - def test_dependencies(self): - # Check we have flair - import flair - - def train(self, training_dir=None): - from flair.trainers import ModelTrainer - - if training_dir is None: - training_dir = script_dir + "flair" + os.sep - - # define columns - columns = {0: "text", 1: "ner"} - - # this is the folder in which train, test and dev files reside - data_folder = training_dir + "data" - - # init a corpus using column format, data folder and the names of the train, dev and test files - # note that training data should be unescaped, i.e. 
-        corpus: Corpus = ColumnCorpus(
-            data_folder,
-            columns,
-            train_file="sent_train.txt",
-            test_file="sent_test.txt",
-            dev_file="sent_dev.txt",
-        )
-
-        print(corpus)
-
-        tag_type = "ner"
-        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
-        print(tag_dictionary)
-
-        # initialize embeddings
-        embeddings: TransformerWordEmbeddings = TransformerWordEmbeddings('onlplab/alephbert-base')
-
-        tagger: SequenceTagger = SequenceTagger(
-            hidden_size=128, embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type=tag_type, use_crf=True,
-        )
-
-        trainer: ModelTrainer = ModelTrainer(tagger, corpus)
-
-        trainer.train(training_dir, learning_rate=0.1, mini_batch_size=32, max_epochs=50)
-        self.model = tagger
-
-    def predict(self, tt_sgml, outmode="binary"):
-        def is_tok(sgml_line):
-            return len(sgml_line) > 0 and not (sgml_line.startswith("<") and sgml_line.endswith(">"))
-
-        def is_sent(line):
-            return line in ["<s>", "</s>"] or line.startswith("<s ")
-
-        # Chop the token stream into overlapping shingles of span_size tokens, one every stride_size tokens
-        toks = [line.split("\t")[0].strip() for line in tt_sgml.strip().split("\n") if is_tok(line)]
-        spans = []
-        mapping = defaultdict(set)
-        final_mapping = {}
-        snum = 0
-        idx = self.stride_size
-        while idx - self.stride_size < len(toks):
-            span = toks[idx - self.stride_size : idx + self.span_size - self.stride_size]
-            spans.append(Sentence(" ".join(span), use_tokenizer=lambda x: x.split()))
-            for i in range(idx - self.stride_size, idx + self.span_size - self.stride_size):
-                if i >= 0 and i < len(toks):
-                    mapping[i].add((idx - self.stride_size, idx + self.span_size - self.stride_size, snum))
-            idx += self.stride_size
-            snum += 1
-
-        for idx in mapping:
-            best = self.span_size
-            for m in mapping[idx]:
-                start, end, snum = m
-                dist_to_end = end - idx
-                dist_to_start = idx - start
-                delta = abs(dist_to_end - dist_to_start)
-                if delta < best:
-                    best = delta
-                    final_mapping[idx] = (snum, idx - start)  # Get sentence number and position in sentence
-
-        # Predict
-        preds = self.model.predict(spans)
-
-        if preds is None:  # Newer versions of flair have void predict method, use modified Sentence list
-            preds = spans
-
-        labels = []
-        for idx in final_mapping:
-            snum, position = final_mapping[idx]
-            if str(flair.__version__).startswith("0.4"):
-                label = 0 if preds[snum].tokens[position].tags["ner"].value == "O" else 1
-            else:
-                label = 0 if preds[snum].tokens[position].labels[0].value == "O" else 1
-
-            labels.append(label)
-
-        if outmode == "binary":
-            return labels
-
-        # Generate edited XML if desired
-        output = []
-        counter = 0
-        first = True
-        for line in tt_sgml.strip().split("\n"):
-            if is_sent(line):  # Remove existing sentence tags
-                continue
-            if is_tok(line):
-                if labels[counter] == 1:
-                    if not first:
-                        output.append("</s>")
-                    output.append("<s>")
-                    first = False
-                counter += 1
-            output.append(line)
-        output.append("</s>")  # Final closing
-
-        output = reorder("\n".join(output))
-
-        return output.strip() + "\n"
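The shingling above is the heart of the splitter: the token stream is cut into overlapping windows of span_size tokens, a new window starting every stride_size tokens, every window is tagged independently, and each token then takes its label from the window in which it sits most centrally, so no decision is made near a window edge. A self-contained sketch of that voting step, independent of flair (window sizes and the helper name are illustrative, not part of HebPipe's API):

from collections import defaultdict

def central_window(n_tokens, span_size=20, stride_size=10):
    """For each token index, choose (window_number, offset_in_window) of the
    overlapping window whose center is nearest to the token."""
    candidates = defaultdict(set)
    start, wnum = 0, 0
    while start < n_tokens:
        end = start + span_size
        for i in range(start, min(end, n_tokens)):
            candidates[i].add((start, end, wnum))
        start += stride_size
        wnum += 1

    choice = {}
    for i, windows in candidates.items():
        best = None
        for w_start, w_end, w_num in windows:
            delta = abs((w_end - i) - (i - w_start))  # 0 would be perfectly central
            if best is None or delta < best:
                best = delta
                choice[i] = (w_num, i - w_start)
    return choice

# In a 30-token document, token 17 is labeled by window 1 (tokens 10-29), offset 7:
print(central_window(30)[17])  # -> (1, 7)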
-    def split(self, xml_data):
-        def wrap_words(xml):
-            output = []
-            lines = xml.split("\n")
-            for line in lines:
-                if len(line)>0 and not (line.startswith("<") and line.endswith(">") and not line == "|"):
-                    line = line.replace("|","\n")
-                    line = "<❦♥>\n" + line + "\n</❦♥>"
-                output.append(line)
-            return "\n".join(output)
-
-        def collapse_words(sgml):
-            output = []
-            buffer = []
-            for line in sgml.split("\n"):
-                if line in ['<❦♥>','</❦♥>'] or not is_sgml_tag(line):
-                    buffer.append(line)
-                else:
-                    output.append(line)
-                if line == "</❦♥>":
-                    piped = "|".join(buffer)
-                    if not (buffer[1] == "|" and len(buffer) == 3):  # Actual pipe as token
-                        piped = piped.replace('|</❦♥>','</❦♥>').replace('<❦♥>|','<❦♥>')
-                    output.append(piped)
-                    buffer = []
-            return "\n".join(output)
-
-        # Sometimes the tokenizer doesn't newline every elt
-        xml_data = xml_data.replace("><", ">\n<")
-        # Ad hoc fix for a tokenization error
-        xml_data = xml_data.replace("°<", "°\n<")
-        # Remove empty elements?
-        # for elt in TAGS:
-        #     xml_data = xml_data.replace(f"<{elt}>\n</{elt}>\n", "")
-        xml_data = wrap_words(xml_data)
-
-        # don't feed the sentencer our pos and lemma predictions, if we have them
-        no_pos_lemma = re.sub(r"([^\n\t]*?)\t[^\n\t]*?\t[^\n\t]*?\n", r"\1\n", xml_data)
-        split_indices = self.predict(no_pos_lemma)
-
-        # for xml
-        counter = 0
-        splitted = []
-        opened_sent = False
-        para = True
-
-        xml_data = xml_data.replace("<s>","<❦❦❦>").replace("</s>","</❦❦❦>")
-        for line in xml_data.strip().split("\n"):
-            if not is_sgml_tag(line):
-                # Token
-                if split_indices[counter] == 1 or para:
-                    if opened_sent:
-                        rev_counter = len(splitted) - 1
-                        while is_sgml_tag(splitted[rev_counter]) and rev_counter > 0:
-                            rev_counter -= 1
-                        if rev_counter > 0:
-                            splitted.insert(rev_counter + 1, "</s>")
-                    splitted.append("<s>")
-                    opened_sent = True
-                    para = False
-                counter += 1
-            elif any(f"<{elt}>" in line for elt in BLOCK_TAGS) or any(
-                f"</{elt}>" in line for elt in BLOCK_TAGS
-            ):  # New block, force sentence split
-                para = True
-            splitted.append(line)
-
-        if opened_sent:
-            rev_counter = len(splitted) - 1
-            while is_sgml_tag(splitted[rev_counter]):
-                rev_counter -= 1
-            splitted.insert(rev_counter + 1, "</s>")
-
-        lines = "\n".join(splitted)
-        lines = re.sub(r'</?❦❦❦>\n?','',lines)
-        lines = reorder(lines, priorities=["s","❦♥"])
-        lines = collapse_words(lines)
-
-        # destroy any xml inside supertokens
-        while re.search(r'(<❦♥>[^<>]*)<[^❦♥]+>',lines) is not None:
-            lines = re.sub(r'(<❦♥>[^<>]*)<[^❦♥]+>([^<>]*)',r'\1\2',lines)
-
-        # remove word and sent wrappers
-        lines = re.sub(r'</?❦♥>','',lines)
-
-        lines = reorder(lines)
-        lines = fix_malformed_sentences(lines.split("\n"))
-        lines = "\n".join(lines)
-        lines = reorder(lines)
-
-        return lines
-
-
-if __name__ == "__main__":
-    from argparse import ArgumentParser
-
-    p = ArgumentParser()
-    p.add_argument("--file", default=None, help="TT SGML file to test sentence splitting on, or training dir")
-    p.add_argument("-m", "--mode", choices=["test", "train"], default="test")
-    p.add_argument(
-        "-o",
-        "--out_format",
-        choices=["binary", "sgml"],
-        help="output list of binary split indices or TT SGML",
-        default="sgml",
-    )
-
-    opts = p.parse_args()
-    sentencer = FlairSentSplitter()
-    if opts.mode == "train":
-        sentencer.train(training_dir=opts.file)
-    else:
-        sgml = io.open(opts.file, encoding="utf8").read()
-        result = sentencer.predict(sgml, outmode=opts.out_format)
-        print(result)
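For orientation, this is how the splitter removed above was typically driven from Python before this change. A minimal sketch based on the deleted class (the input file name is illustrative, and the models/heb.sent weights must be present next to the package):

import io
from flair_sent_splitter import FlairSentSplitter  # the module removed by this patch

splitter = FlairSentSplitter()   # span_size=20, stride_size=10 by default
splitter.load_model()            # loads models/heb.sent relative to the package

sgml = io.open("document.sgml", encoding="utf8").read()
print(splitter.predict(sgml, outmode="binary"))  # one 0/1 sentence-start flag per token
print(splitter.split(sgml))                      # TT SGML re-wrapped in <s> elements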
From 10cd30cc03d32f752e5440bd3aa20e6486ca31bc Mon Sep 17 00:00:00 2001
From: amir-zeldes
Date: Tue, 18 Oct 2022 10:04:48 -0400
Subject: [PATCH 32/32] bump version to 3.0

---
 hebpipe/lib/_version.py | 2 +-
 setup.py                | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/hebpipe/lib/_version.py b/hebpipe/lib/_version.py
index b92e3cd..261d931 100644
--- a/hebpipe/lib/_version.py
+++ b/hebpipe/lib/_version.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 
-__version__ = "2.0.1.0"
+__version__ = "3.0.0.0"
 __author__ = "Amir Zeldes"
 __copyright__ = "Copyright 2018-2022, Amir Zeldes"
 __license__ = "Apache 2.0 License"
diff --git a/setup.py b/setup.py
index 419a20b..29113ce 100644
--- a/setup.py
+++ b/setup.py
@@ -3,16 +3,15 @@
 setup(
     name = 'hebpipe',
     packages = find_packages(),
-    version = '2.0.1.0',
+    version = '3.0.0.0',
     description = 'A pipeline for Hebrew NLP',
     author = 'Amir Zeldes',
     author_email = 'amir.zeldes@georgetown.edu',
     package_data = {'':['README.md','LICENSE.md','requirements.txt'],'hebpipe':['lib/*','data/*','bin/*','models/models_go_here.txt','models/stanza/stanza_models_here.txt']},
-    install_requires=['numpy','transformers==3.5.1','torch==1.6.0','pandas','scipy','joblib','xgboost==0.81','rftokenizer','depedit','xmltodict',
-                      'diaparser==1.1.2','flair==0.6.1','stanza','conllu'],
+    install_requires=['requests','numpy','transformers==3.5.1','torch==1.6.0','pandas','scipy','joblib','xgboost==0.81','rftokenizer','depedit','xmltodict', 'diaparser==1.1.2','flair==0.6.1','stanza','conllu'],
     url = 'https://github.com/amir-zeldes/HebPipe',
     license='Apache License, Version 2.0',
-    download_url = 'https://github.com/amir-zeldes/HebPipe/releases/tag/v2.0.1.0',
+    download_url = 'https://github.com/amir-zeldes/HebPipe/releases/tag/v3.0.0.0',
     keywords = ['NLP', 'Hebrew', 'segmentation', 'tokenization', 'tagging', 'parsing','morphology','POS','lemmatization'],
     classifiers = ['Programming Language :: Python',
         'Programming Language :: Python :: 2',