文本分类模型微调

介绍

基于BERT（Bidirectional Encoder Representations from Transformers）的语义相似度算法是一种强大的自然语言处理工具，它能够理解文本之间的深层语义关系。它通过双向Transformer编码器来预训练深层的语言表示。

实现语义相似度算法的步骤：

预训练BERT模型：首先需要在一个大规模的文本语料库上预训练BERT模型，使其学习语言的通用表示。

微调：在特定的任务数据集上对预训练的BERT模型进行微调，以适应特定的应用场景。

提取特征：使用微调后的模型提取文本的特征向量。

计算相似度：通过比较这些特征向量来计算文本之间的语义相似度。

环境准备

安装 huggingface 库

pip install transformers datasets

下载预训练模型

一般使用中文的 bert 模型（bert-base-chinese）

from transformers import BertTokenizer, BertForSequenceClassification

# Name of the checkpoint to fetch; replace it with the model you chose.
model_name = "bert-base-chinese"
# Directory in which the downloaded files are stored.
cache_dir = "./model_cache"

# Download the tokenizer and the classification model into cache_dir and
# load both of them.
tokenizer = BertTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    cache_dir=cache_dir,
)

下载微调使用的数据集

huggingface 筛选文本分类数据集：在左侧筛选栏里选择 tasks，然后选择 text-classification（注意这不是必须的，有些文本分类数据集没有被打上 text-classification 标签，筛选了反而搜索不到）
下面使用 weibo 的数据集

from datasets import load_dataset

# Download the dataset from the Hugging Face Hub; raw files are cached
# under cache_dataset/.
dataset = load_dataset(path="kuroneko5943/weibo16", cache_dir=r"cache_dataset/")
# Persist the loaded dataset locally so it can be reloaded without network
# access. The path is passed positionally because the keyword differs between
# Dataset (dataset_path) and DatasetDict (dataset_dict_path), and load_dataset
# without `split` returns a DatasetDict.
dataset.save_to_disk("cache_dataset/mydata/")

数据集会有两个重要的部分，一个是文本，一个是文本对应的分类值。一般会如下，但是不同数据集 key 会不一样，在后面代码中需要动态调整
- label：分类值
- text：文本

label,text
1,崩溃了。
7,文革之后还能保留这个习俗也算异数了。
7,其实更好。

训练

1、自定义训练集类

split 入参用于指定要加载的数据集类型，一般分为训练数据集、验证数据集、测试数据集（有的数据集可能只包含其中某一种，需要根据实际情况调整 MyDataset 类）
__getitem__() 方法需要返回数据集中的文本和分类值，按照数据集中对应的 key 取值并返回即可

from torch.utils.data import Dataset
from datasets import load_dataset

class MyDataset(Dataset):
    """Map-style dataset over one split of the local Weibo CSV files.

    Each item is a ``(text, label)`` tuple taken from the ``text`` and
    ``label`` columns of ``data/Weibo/{split}.csv``.
    """

    def __init__(self, split):
        """Load ``data/Weibo/{split}.csv`` as the split named ``split``.

        Args:
            split: Split name such as ``"train"``, ``"validation"`` or
                ``"test"``; it selects both the CSV file and the split key.
        """
        # Key data_files by the split name: a bare path is always registered
        # as the "train" split, so requesting "validation" or "test" from it
        # would raise an unknown-split error.
        self.dataset = load_dataset(
            path="csv",
            data_files={split: f"data/Weibo/{split}.csv"},
            split=split,
        )

    def __len__(self):
        """Return the number of rows in the loaded split."""
        return len(self.dataset)

    def __getitem__(self, item):
        """Return the ``(text, label)`` pair stored at index ``item``."""
        # Fetch the row once instead of indexing the dataset per column.
        row = self.dataset[item]
        return row["text"], row["label"]

2、定义下游任务类

torch.nn.Linear(768, 8)：将模型的 768 维特征转换为数据集的类别数特征
- 768 是 bert-base-chinese 模型的 out_features，可以通过 print(pretrained) 看到使用模型的 out_features
- 8是数据集分类的种类数，示例使用的 weibo 数据集会将文本分为8种类别
forward
- input_ids：输入文本经分词器编码后得到的 token id 序列，即调用分词器的 token.batch_encode_plus 将文本转换为 id 之后的结果
- attention_mask：用于标出 id 序列中哪一部分是有效内容。因为每一个文本都会被编码成相同长度的序列，但文本本身长短不一，较短的文本编码后剩余位置会用 0 补齐；attention_mask 就是用来标记非补齐部分（真实 token）的位置

from transformers import BertModel
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Load the pretrained BERT backbone from the local snapshot directory, then
# move it onto the selected device.
pretrained = BertModel.from_pretrained(
    r"xxx\models--bert-base-chinese\snapshots\c30a6ed22ab4564dc1e3b2ecbf6e766b0611a33f"
)
pretrained = pretrained.to(DEVICE)

# Downstream task head: classifies the features extracted by the backbone.
class Model(torch.nn.Module):
    """Linear classification head on top of the frozen module-level BERT.

    The backbone is the module-level ``pretrained`` model; it is called
    under ``torch.no_grad()`` so only the linear layer receives gradients.
    """

    def __init__(self, num_classes=8):
        """Create the classification layer.

        Args:
            num_classes: Number of target categories. Defaults to 8, the
                number of classes in the example Weibo dataset.
        """
        super().__init__()
        # 768-dimensional BERT feature -> one score per class.
        self.fc = torch.nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalized class scores (logits), one row per sample.

        Args:
            input_ids: Token ids produced by the tokenizer.
            attention_mask: Mask separating real tokens from padding.
            token_type_ids: Segment ids produced by the tokenizer.

        Returns:
            The output of the linear layer applied to the first-position
            ([CLS]) hidden state. Apply ``softmax(dim=1)`` on the result if
            probabilities are needed; ``argmax(dim=1)`` gives the predicted
            class either way.
        """
        with torch.no_grad():  # keep the BERT weights frozen
            out = pretrained(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )
        cls_feature = out.last_hidden_state[:, 0]  # hidden state of [CLS]
        # No softmax here: torch.nn.CrossEntropyLoss expects raw logits and
        # applies log-softmax itself, so normalizing first would flatten the
        # gradients and slow down training.
        return self.fc(cls_feature)

3、训练

如果有验证数据集，最好在每训练完一轮后使用验证集进行验证：只有当验证评分大于历史最好评分（或者验证损失值小于历史最小损失值，二者取其一作为判断标准即可）时，才保存这一轮的结果。否则模型可能已经过拟合，保存下来的训练效果反而不好

from torch.utils.data import Dataset, DataLoader
from datasets import load_from_disk
from transformers import BertModel, BertTokenizer
from torch.optim import AdamW
import torch
from MyData import MyDataset
from net import Model

# Number of training epochs.
EPOCH = 100
# Use the GPU when CUDA is available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Local snapshot directory shared by the model and the tokenizer, so both are
# guaranteed to come from the same checkpoint ("/xxx" is a placeholder for the
# real cache location).
MODEL_DIR = r"/xxx/models--bert-base-chinese/snapshots/8f23c25b06e129b6c986331a13d8d025a92cf0ea"

# Load the pretrained model.
pretrained = BertModel.from_pretrained(MODEL_DIR).to(DEVICE)
# Load the tokenizer.
token = BertTokenizer.from_pretrained(MODEL_DIR)
    
# Batch collation function used by the data loaders.
def collate_fn(data):
    """Turn a batch of ``(text, label)`` samples into model input tensors.

    Args:
        data: Batch of samples; element 0 of each sample is the text and
            element 1 is its label.

    Returns:
        A tuple ``(input_ids, attention_mask, token_type_ids, labels)``.
        The first three come from the module-level tokenizer ``token``,
        truncated/padded to 500 tokens; ``labels`` is a ``LongTensor``.
    """
    texts = []
    targets = []
    for sample in data:
        texts.append(sample[0])
        targets.append(sample[1])

    # Encode the sentences into ids.
    encoded = token.batch_encode_plus(
        batch_text_or_text_pairs=texts,
        truncation=True,
        padding="max_length",
        max_length=500,
        return_tensors="pt",
        return_length=True,
    )
    return (
        encoded["input_ids"],
        encoded["attention_mask"],
        encoded["token_type_ids"],
        torch.LongTensor(targets),
    )
# Build the datasets.
train_dataset = MyDataset("train")  # training split
val_dataset = MyDataset("validation")  # validation split
# Build the data loaders.
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=100,
                          shuffle=True,
                          drop_last=True,
                          collate_fn=collate_fn)
# Validation needs neither shuffling nor dropping the last partial batch:
# every sample should be scored exactly once.
val_loader = DataLoader(dataset=val_dataset,
                        batch_size=100,
                        shuffle=False,
                        drop_last=False,
                        collate_fn=collate_fn)


model = Model().to(DEVICE)
optimizer = AdamW(model.parameters(), lr=5e-4)
loss_func = torch.nn.CrossEntropyLoss()

# Training loop.
for epoch in range(EPOCH):
    # ---- training ----
    model.train()
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader):
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        token_type_ids = token_type_ids.to(DEVICE)
        labels = labels.to(DEVICE)

        # Forward pass.
        out = model(input_ids, attention_mask, token_type_ids)
        # Compute the loss.
        loss = loss_func(out, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report loss and batch accuracy every 5 steps.
        if i % 5 == 0:
            pred = out.argmax(dim=1)
            acc = (pred == labels).sum().item() / len(labels)
            print(epoch, i, loss.item(), acc)

    # ---- validation ----
    model.eval()
    # Reset the accumulators every epoch; they are weighted by batch size so
    # a smaller final batch does not skew the averages.
    sum_val_loss = 0.0
    sum_val_correct = 0
    val_count = 0
    # No gradients are needed for evaluation; this also keeps the summed loss
    # from holding on to autograd graphs.
    with torch.no_grad():
        for input_ids, attention_mask, token_type_ids, labels in val_loader:
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            token_type_ids = token_type_ids.to(DEVICE)
            labels = labels.to(DEVICE)
            out = model(input_ids, attention_mask, token_type_ids)

            loss = loss_func(out, labels)

            batch_size = len(labels)
            sum_val_loss += loss.item() * batch_size
            sum_val_correct += (out.argmax(dim=1) == labels).sum().item()
            val_count += batch_size
    # Guard against an empty validation split.
    avg_val_loss = sum_val_loss / max(val_count, 1)
    avg_val_acc = sum_val_correct / max(val_count, 1)
    print(f"val==>epoch:{epoch},avg_val_loss:{avg_val_loss}, avg_val_acc:{avg_val_acc}")

    # NOTE: to keep only the best checkpoint, compare avg_val_acc with the
    # best value seen so far and save only when it improves. This script
    # saves the parameters after every epoch.
    torch.save(model.state_dict(), f"cache_dataset/train_data/{epoch}bert01.pth")
    print(epoch, "参数保存成功！")

4、使用训练得到的模型参数，对用户输入进行分类

# Class names, indexed by the predicted label id.
names = ["like","disgust","happiness","sadness","anger","surprise","fear","none"]


def test():
    """Interactively classify sentences typed by the user.

    Loads the parameters saved after epoch 0, then repeatedly reads a line
    from standard input, prints the predicted class name, and stops when the
    user enters ``q``.
    """
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    model.load_state_dict(
        torch.load("cache_dataset/train_data/0bert01.pth", map_location=DEVICE)
    )
    model.eval()
    while True:
        data = input("请输入测试数据(输入'q'退出)：")
        if data == "q":
            print("测试结束")
            break
        # collate_fn expects a batch of (text, label) pairs and returns four
        # tensors, so wrap the sentence in a one-item batch with a dummy label
        # and discard the label tensor.
        input_ids, attention_mask, token_type_ids, _ = collate_fn([(data, 0)])
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        token_type_ids = token_type_ids.to(DEVICE)

        with torch.no_grad():
            out = model(input_ids, attention_mask, token_type_ids)
        # The batch holds a single sentence; convert its prediction to an int
        # before using it as a list index.
        pred = out.argmax(dim=1).item()
        print("模型判定：", names[pred], "\n")


test()