Huggingface 新增了一份 fine-tuning with custom datasets 指南,其中包含很多有用的信息。借助其中 IMDB sequence classification 部分的内容,我成功地把一个原本使用 GLUE 数据集的笔记本改成了使用我自己的 pandas 数据框。
from transformers import (
AutoConfig,
AutoTokenizer,
TFAutoModelForSequenceClassification,
AdamW
)
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
# Pretrained checkpoint name; the tokenizer is loaded from the same name.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the pickled DataFrame and pull out its `text` and `label` columns.
df = pd.read_pickle('data.pkl')
train_texts = df['text'].values  # expected to be an array of strings
train_labels = df['label'].values  # expected to be an array of integers

# Hold out 20% of the rows for validation.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
)

# Both splits are tokenized with the same options: truncate to at most
# 96 tokens and pad.
_tokenize_options = dict(truncation=True, max_length=96, padding=True)
train_encodings = tokenizer(train_texts.tolist(), **_tokenize_options)
val_encodings = tokenizer(val_texts.tolist(), **_tokenize_options)

# Pair each example's encoded features (as a plain dict) with its label.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), train_labels)
)
val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(val_encodings), val_labels)
)
# Training hyperparameters. They are assigned before the batching calls
# below, which read the batch sizes (assigning them afterwards raises
# NameError when the script runs top to bottom).
num_labels = 3
learning_rate = 2e-5
train_batch_size = 8
eval_batch_size = 8
num_epochs = 1

# Count examples before batching: once `.batch(...)` is applied, `len()`
# reports the number of batches instead of the number of examples.
num_train_examples = len(train_dataset)
num_dev_examples = len(val_dataset)

# Shuffle with a 100-element buffer, then group into fixed-size batches.
train_dataset = train_dataset.shuffle(100).batch(train_batch_size)
val_dataset = val_dataset.shuffle(100).batch(eval_batch_size)

# Number of whole batches per epoch; a trailing partial batch is not counted.
train_steps_per_epoch = num_train_examples // train_batch_size
dev_steps_per_epoch = num_dev_examples // eval_batch_size
# Load the checkpoint's configuration with `num_labels` output classes,
# then instantiate the sequence-classification model from it.
config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
)

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# from_logits=True tells the loss to treat predictions as unnormalized logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [
    tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy', dtype=tf.float32),
]
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Train, evaluating on the validation dataset; the returned object is kept
# in `history`.
history = model.fit(
    train_dataset,
    epochs=num_epochs,
    steps_per_epoch=train_steps_per_epoch,
    validation_data=val_dataset,
    validation_steps=dev_steps_per_epoch,
)
笔记本来源(致谢):digitalepidemiologylab 的 covid-twitter-bert colab