- """
- @author: luojunhui
- """
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Load the data
df = pd.read_csv("festival_data.csv")
texts = df['text'].tolist()
labels = df['label'].tolist()

# Split into training and validation sets (stratified to preserve the class balance)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, stratify=labels)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# Text-encoding helper
def encode_texts(text_list, max_len=32):
    return tokenizer(
        text_list,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt'
    )

# Encode the training and validation sets
train_encodings = encode_texts(train_texts)
val_encodings = encode_texts(val_texts)
# Build PyTorch datasets
import torch
from torch.utils.data import Dataset, DataLoader

class FestivalDataset(Dataset):
    def __init__(self, encodings, labels):
        self.input_ids = encodings['input_ids']
        self.attention_mask = encodings['attention_mask']
        self.labels = torch.tensor(labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

    def __len__(self):
        return len(self.labels)

train_dataset = FestivalDataset(train_encodings, train_labels)
val_dataset = FestivalDataset(val_encodings, val_labels)

# Create the data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)
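# Optional sanity check (a sketch, not in the original script): pull one batch
# to confirm the shapes produced by FestivalDataset and the loaders above.
# Assuming at least 32 training rows, input_ids and attention_mask should be
# [32, 32] (batch_size x max_len) and labels should be [32].
sample_batch = next(iter(train_loader))
print({k: tuple(v.shape) for k, v in sample_batch.items()})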
from transformers import BertForSequenceClassification
from torch.optim import AdamW
import torch.nn as nn

# Load the pretrained model with a single output logit for binary classification
model = BertForSequenceClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=1  # one logit; trained with BCEWithLogitsLoss below
)

# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss()  # suited to binary classification with a single logit
# Training loop
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        # Keep the labels out of the forward pass so the model does not try to
        # compute its own (incompatible) loss; BCEWithLogitsLoss is applied manually.
        labels_batch = batch['labels'].to(device).float()
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        outputs = model(**inputs)
        logits = outputs.logits.squeeze(-1)
        loss = loss_fn(logits, labels_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1} | Train Loss: {total_loss / len(train_loader):.4f}")

    # Validation stage
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            labels_batch = batch['labels'].to(device).float()
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            outputs = model(**inputs)
            logits = outputs.logits.squeeze(-1)
            loss = loss_fn(logits, labels_batch)
            val_loss += loss.item()
    print(f"Epoch {epoch + 1} | Val Loss: {val_loss / len(val_loader):.4f}")
# Save the full model (architecture plus weights)
torch.save(model, "festival_bert_model.pth")
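# Optional: inference sketch (an assumption, not part of the original script).
# Encodes a new sentence with the same tokenizer settings used for training and
# converts the single logit into a probability for the positive class. The saved
# "festival_bert_model.pth" can later be reloaded with torch.load (newer PyTorch
# versions may additionally require weights_only=False for full-model pickles).
def predict(text):
    model.eval()
    enc = tokenizer(text, padding='max_length', truncation=True,
                    max_length=32, return_tensors='pt').to(device)
    with torch.no_grad():
        logit = model(**enc).logits.squeeze(-1)
    return torch.sigmoid(logit).item()

print(predict("春节快乐"))  # probability that the sentence belongs to the positive class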