"""
BERT Model for Multi-label Classification
=========================================
This step-by-step example shows how to train and test a BERT model via LibMultiLabel.
Import the libraries
----------------------------
Please add the following code to your Python 3 script.
"""
from libmultilabel.nn.data_utils import load_datasets, load_or_build_label, get_dataset_loader
from libmultilabel.nn.nn_utils import set_seed, init_device, init_model, init_trainer
from transformers import AutoTokenizer
######################################################################
# Set up the device
# --------------------
# To reproduce results, use the function ``set_seed``.
# For example, with the seed ``1337`` you will always get the same results.
#
# To initialize a hardware device, use ``init_device`` to assign the device you want to use.
set_seed(1337)
device = init_device() # use gpu by default
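######################################################################
# If you prefer to run on the CPU (or no GPU is available), ``init_device``
# also accepts a flag for that. This is a minimal sketch; check the
# signature in your installed version::
#
#     device = init_device(use_cpu=True)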
######################################################################
# Load and tokenize data
# ------------------------------------------
# We assume that the ``rcv1`` data is located in the directory ``./data/rcv1``
# and that the files ``train.txt`` and ``test.txt`` exist there.
# You can use the function ``load_datasets()`` to load the datasets.
# By default, LibMultiLabel tokenizes documents, but the BERT model uses its own tokenizer,
# so we must set ``tokenize_text=False``.
# Note that ``datasets`` contains three sets: ``datasets['train']``, ``datasets['val']`` and ``datasets['test']``,
# where ``datasets['train']`` and ``datasets['val']`` are randomly split from ``train.txt`` with the ratio ``8:2``.
#
# For the labels of the data, we apply the function ``load_or_build_label()`` to generate the label set.
#
# For BERT, we use the ``AutoTokenizer`` API provided by Hugging Face for word preprocessing,
# and leave the other word-preprocessing variables as ``None``.
datasets = load_datasets("data/rcv1/train.txt", "data/rcv1/test.txt", tokenize_text=False)
classes = load_or_build_label(datasets)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
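######################################################################
# As a quick sanity check (optional, and not part of the original example),
# you can print the size of each split and the number of classes::
#
#     print({split: len(datasets[split]) for split in datasets})
#     print(f"Number of classes: {len(classes)}")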
######################################################################
# Initialize a model
# --------------------------
#
# We use the following code to initialize a model.
model_name = "BERT"
network_config = {
    "encoder_hidden_dropout": 0.1,
    "lm_weight": "bert-base-uncased",
}
learning_rate = 0.00003
model = init_model(
    model_name=model_name,
    network_config=network_config,
    classes=classes,
    learning_rate=learning_rate,
    monitor_metrics=["Micro-F1", "Macro-F1", "P@1", "P@3", "P@5"],
)
######################################################################
# * ``model_name`` tells the ``init_model`` function which network model to use.
# * ``network_config`` contains the configuration of the network model (see the sketch below for an alternative backbone).
# * ``classes`` is the label set of the data.
# * ``init_weight``, ``word_dict`` and ``embed_vecs`` are not used by a BERT model, so we can ignore them.
# * ``monitor_metrics`` includes the metrics you would like to track.
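#
# Because the BERT network loads its backbone from Hugging Face, other
# pre-trained weights can in principle be tried by changing ``lm_weight``.
# This is a hedged sketch, not part of the original example; whether a given
# checkpoint works depends on your installed version, and the tokenizer
# name should be changed to match ``lm_weight``::
#
#     network_config = {
#         "encoder_hidden_dropout": 0.1,
#         "lm_weight": "roberta-base",  # hypothetical alternative backbone
#     }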
#
#
# Initialize a trainer
# ----------------------------
#
# We use the function ``init_trainer`` to initialize a trainer.
trainer = init_trainer(checkpoint_dir="runs/NN-example", epochs=15, val_metric="P@5")
######################################################################
# In this example, ``checkpoint_dir`` is the directory where the best and the last models are saved during training. Furthermore, we set the number of training epochs with ``epochs=15`` and the validation metric with ``val_metric='P@5'``.
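#
# If you want training to stop early once the validation metric stops
# improving, ``init_trainer`` also takes a ``patience`` argument. This is a
# hedged sketch; verify the argument against your installed version::
#
#     trainer = init_trainer(
#         checkpoint_dir="runs/NN-example", epochs=15, patience=5, val_metric="P@5"
#     )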
#
# Create data loaders
# ---------------------------
#
# In most cases, we cannot load the full dataset at once due to hardware limitations.
# Therefore, we use data loaders, which load a batch of samples at a time.
loaders = dict()
for split in ["train", "val", "test"]:
    loaders[split] = get_dataset_loader(
        data=datasets[split],
        classes=classes,
        device=device,
        max_seq_length=512,
        batch_size=8,
        shuffle=(split == "train"),  # shuffle only the training set
        tokenizer=tokenizer,
    )
######################################################################
# This example creates three loaders, with the batch size set by ``batch_size=8``. Other parameters are documented `here <../api/nn.html#libmultilabel.nn.data_utils.get_dataset_loader>`_.
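#
# To see what a batch looks like, you can pull one from the training loader
# (optional; the exact field names follow LibMultiLabel's collate function
# and may differ across versions)::
#
#     batch = next(iter(loaders["train"]))
#     print(batch.keys())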
#
# Train and test a model
# ------------------------------
#
# The BERT model training process can be started via
trainer.fit(model, loaders["train"], loaders["val"])
######################################################################
# After the training process is finished, we can run the test process by
trainer.test(model, dataloaders=loaders["test"])
######################################################################
# The results should be similar to::
#
#     {
#         'Macro-F1': 0.569891024909958,
#         'Micro-F1': 0.8142925500869751,
#         'P@1': 0.9552904367446899,
#         'P@3': 0.7907078266143799,
#         'P@5': 0.5505486726760864
#     }