import torch
import torch.nn as nn
from transformers import AutoModel, AutoModelForSequenceClassification
class CustomBaseModel(nn.Module):
    def __init__(self, model_name: str, num_classes: int = 3, dr_rate: float = 0.2):
        super().__init__()
        self.num_classes = num_classes
        self.dr_rate = dr_rate
        self.model = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Sequential(
            nn.Linear(self.model.config.hidden_size, self.model.config.hidden_size),
            nn.GELU(),
            nn.Dropout(p=self.dr_rate),
            nn.Linear(self.model.config.hidden_size, num_classes),
        )

    def forward(self, input_ids, attention_mask=None):
        # One plausible completion of the elided forward: take the first ([CLS])
        # token's hidden state, mirroring ElectraClassificationHead below.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(outputs.last_hidden_state[:, 0, :])
model = CustomBaseModel(model_name="beomi/KcELECTRA-base-v2022").to("cuda")
model
CustomBaseModel(
  (model): ElectraModel(
    (embeddings): ElectraEmbeddings(
      ...
    )
    ...
  )
  (classifier): Sequential(
    (0): Linear(in_features=768, out_features=768, bias=True)
    (1): GELU(approximate='none')
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=768, out_features=3, bias=True)
  )
)
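For comparison, the printout below comes from the stock sequence-classification wrapper. A minimal sketch of that call, assuming the same checkpoint (the hf_model name and num_labels=3 are choices made here to match the three-class head, not taken from the original):

hf_model = AutoModelForSequenceClassification.from_pretrained(
    "beomi/KcELECTRA-base-v2022", num_labels=3
)
hf_model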
ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      ...
    )
    ...
  )
  (classifier): ElectraClassificationHead(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (activation): GELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
    (out_proj): Linear(in_features=768, out_features=3, bias=True)
  )
)
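Structurally the two heads match: Linear(768, 768) → GELU → Dropout → Linear(768, 3). The differences are the dropout rate (0.2 in the custom head vs. Electra's default hidden_dropout_prob of 0.1) and its placement: ElectraClassificationHead applies dropout both before the dense layer and before the output projection, while the custom head drops only after the activation. A quick shape sanity check, assuming the model/hf_model objects above (the dummy tensors are illustrative only):

dummy_seq = torch.randn(2, 16, 768)          # (batch, seq_len, hidden)
print(hf_model.classifier(dummy_seq).shape)  # torch.Size([2, 3]); the head slices out position 0 itself

dummy_cls = torch.randn(2, 768, device="cuda")  # the custom head expects a pre-sliced [CLS] vector
print(model.classifier(dummy_cls).shape)        # torch.Size([2, 3])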
# line 929 in transformers/models/electra/modeling_electra.py
from .activations import get_activation
...
class ElectraClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = get_activation("gelu")(x)  # although BERT uses tanh here, it seems Electra authors used gelu here
        x = self.dropout(x)
        x = self.out_proj(x)
        return x
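For context, ElectraForSequenceClassification.forward (elsewhere in the same file) runs the backbone and hands the full last hidden state to this head, which then slices out position 0 itself. Condensed, the relevant lines look roughly like this (a paraphrase, not a verbatim excerpt):

discriminator_hidden_states = self.electra(input_ids, attention_mask=attention_mask)
sequence_output = discriminator_hidden_states[0]  # (batch, seq_len, hidden)
logits = self.classifier(sequence_output)         # the ElectraClassificationHead above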
# transformers/activations.py -- imports from the top of the file, then the class at line 60
import math

import torch
from torch import Tensor, nn

...
class GELUActivation(nn.Module):
    """
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self, use_gelu_python: bool = False):
        super().__init__()
        if use_gelu_python:
            self.act = self._gelu_python
        else:
            self.act = nn.functional.gelu

    def _gelu_python(self, input: Tensor) -> Tensor:
        return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0)))

    def forward(self, input: Tensor) -> Tensor:
        return self.act(input)
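The docstring above gives both formulas, so the gap between the exact erf-based GELU and OpenAI GPT's tanh approximation is easy to check numerically (a small illustrative snippet, not from the original):

import math
import torch

x = torch.linspace(-3, 3, 7)
exact = x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
approx = 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
print((exact - approx).abs().max())  # small but nonzero, on the order of 1e-3 or less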
# line 209 in transformers/activations.py
ACT2CLS = {
    "gelu": GELUActivation,
    "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}),
    "gelu_fast": FastGELUActivation,
    "gelu_new": NewGELUActivation,
    "gelu_python": (GELUActivation, {"use_gelu_python": True}),
    "gelu_pytorch_tanh": PytorchGELUTanh,
    "gelu_accurate": AccurateGELUActivation,
    ...
}
ACT2FN = ClassInstantier(ACT2CLS)


def get_activation(activation_string):
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    else:
        raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
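So get_activation("gelu") is just a dict lookup in ACT2FN, and because ACT2FN is a ClassInstantier (a dict that instantiates the stored class, with any stored kwargs, on access), each lookup returns a fresh module instance. A brief usage sketch:

from transformers.activations import get_activation

act = get_activation("gelu")                 # a GELUActivation() instance
print(act(torch.tensor([-1.0, 0.0, 1.0])))
act_py = get_activation("gelu_python")       # GELUActivation(use_gelu_python=True)
# get_activation("no_such_act") would raise KeyError listing the valid names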