"""Fine-tune ``bert-base-chinese`` as a binary text classifier.

@author: luojunhui

Steps performed when this script runs:

1. Read ``festival_data.csv`` and take its ``text`` and ``label`` columns.
2. Split 80/20 into train/validation sets, stratified by label.
3. Tokenize both splits with the ``bert-base-chinese`` tokenizer.
4. Train a single-logit ``BertForSequenceClassification`` head for 5 epochs
   with ``BCEWithLogitsLoss``, printing the mean train and validation loss
   after every epoch.
5. Pickle the whole model object to ``festival_bert_model.pth``.

NOTE(review): the loss assumes the ``label`` column holds 0/1 values --
confirm against the CSV.
"""
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer

# Load the data.
df = pd.read_csv("festival_data.csv")
texts = df['text'].tolist()
labels = df['label'].tolist()

# Split into train/validation sets; stratifying keeps the label proportions
# the same in both splits.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, stratify=labels)

# Initialise the tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')


def encode_texts(text_list, max_len=32):
    """Tokenize a list of strings into fixed-length PyTorch tensors.

    Args:
        text_list: List of raw text strings to encode.
        max_len: Every sequence is padded or truncated to exactly this
            many tokens.

    Returns:
        The tokenizer output (with ``return_tensors='pt'``); this script
        reads its ``input_ids`` and ``attention_mask`` entries.
    """
    return tokenizer(
        text_list,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt'
    )


# Encode the training and validation sets.
train_encodings = encode_texts(train_texts)
val_encodings = encode_texts(val_texts)


class FestivalDataset(Dataset):
    """Map-style dataset pairing tokenized texts with their labels."""

    def __init__(self, encodings, labels):
        """Store the encoded inputs and convert ``labels`` to a tensor.

        Args:
            encodings: Mapping providing ``input_ids`` and
                ``attention_mask`` tensors (as returned by ``encode_texts``).
            labels: Sequence of labels, one per encoded text.
        """
        self.input_ids = encodings['input_ids']
        self.attention_mask = encodings['attention_mask']
        self.labels = torch.tensor(labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)


train_dataset = FestivalDataset(train_encodings, train_labels)
val_dataset = FestivalDataset(val_encodings, val_labels)

# Create the data loaders.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Load the pretrained model with a single output logit for binary
# classification. ``problem_type="single_label_classification"`` is not set:
# it selects a cross-entropy loss over ``num_labels`` classes inside the
# model, which contradicts a one-logit head. The loss is computed externally
# with ``BCEWithLogitsLoss`` instead.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=1,
)

# Training setup.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# ``transformers.AdamW`` is deprecated; use the PyTorch implementation.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss()  # binary classification on raw logits


def _batch_loss(batch):
    """Return the BCE-with-logits loss of ``model`` on one loader batch."""
    inputs = {k: v.to(device) for k, v in batch.items()}
    # Keep the labels out of the forward call so the model does not compute
    # its own internal loss on them.
    targets = inputs.pop('labels').float()
    outputs = model(**inputs)
    # squeeze(-1) drops only the logit dimension, so a batch of size 1 keeps
    # shape (1,) and still matches ``targets``.
    logits = outputs.logits.squeeze(-1)
    return loss_fn(logits, targets)


# Training loop.
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        loss = _batch_loss(batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1} | Train Loss: {total_loss / len(train_loader):.4f}")

    # Validation phase.
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            val_loss += _batch_loss(batch).item()
    print(f"Epoch {epoch + 1} | Val Loss: {val_loss / len(val_loader):.4f}")

# Save the complete model object (architecture + weights) via pickle.
# NOTE(review): loading this file requires the same class definitions to be
# importable; ``model.save_pretrained(...)`` or saving ``model.state_dict()``
# would be more portable, but would change the output format consumers expect.
torch.save(model, "festival_bert_model.pth")