
03 - Hugging Face

Study time: 8-10 hours | Importance: ⭐⭐⭐⭐⭐ | The standard toolkit for NLP and Transformer models


🎯 Learning Objectives

  • Understand how the Transformer architecture works
  • Master the use of tokenizers
  • Learn to use pretrained models
  • Master model fine-tuning methods
  • Understand how different NLP tasks are implemented
  • Learn model optimization and deployment

📚 Content Overview

  1. Transformer Architecture Fundamentals
  2. Tokenizers in Depth
  3. Using Models
  4. The Complete Fine-Tuning Workflow
  5. Different NLP Tasks
  6. Model Optimization and Deployment

1. Transformer Architecture Fundamentals

1.1 Self-Attention Mechanism

Python
import torch
import torch.nn as nn
import math

class SelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()  # super() runs the parent-class initializer, the usual pattern when subclassing
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"  # assert raises AssertionError if the condition is false

        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, mask=None):
        batch_size, seq_len, embed_dim = x.shape

        # Produce Q, K, V
        qkv = self.qkv(x).reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, batch, heads, seq, head_dim)
        q, k, v = qkv[0], qkv[1], qkv[2]

        # Compute attention scores
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Apply the mask (for causal attention or padding)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        # Softmax
        attn_weights = torch.softmax(scores, dim=-1)

        # Weighted sum of the values
        attn_output = torch.matmul(attn_weights, v)

        # Merge the heads back together
        attn_output = attn_output.transpose(1, 2).reshape(batch_size, seq_len, embed_dim)

        return self.proj(attn_output)

# Quick test
x = torch.randn(2, 10, 512)  # (batch, seq_len, embed_dim)
attn = SelfAttention(embed_dim=512, num_heads=8)
output = attn(x)
print(output.shape)  # (2, 10, 512)
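
The `mask` argument above is how causal (autoregressive) attention is implemented. A minimal sketch, reusing the `attn` module and input `x` from the test above: a lower-triangular matrix broadcasts against the (batch, heads, seq, seq) score tensor, so each position only attends to itself and earlier positions.

Python
# Causal mask sketch: 1 = may attend, 0 = masked out
seq_len = x.shape[1]
causal_mask = torch.tril(torch.ones(seq_len, seq_len))  # (seq_len, seq_len), lower triangular
causal_output = attn(x, mask=causal_mask)               # broadcasts over batch and heads
print(causal_output.shape)  # (2, 10, 512)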

1.2 Transformer Encoder

Python
class TransformerEncoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_attn = SelfAttention(embed_dim, num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, embed_dim)
        )
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Self-attention sub-layer (residual connection + LayerNorm)
        attn_output = self.self_attn(x, mask)
        x = self.norm1(x + self.dropout(attn_output))

        # Feed-forward sub-layer (residual connection + LayerNorm)
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))

        return x

class TransformerEncoder(nn.Module):
    def __init__(self, num_layers, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(embed_dim, num_heads, ff_dim, dropout)
            for _ in range(num_layers)
        ])

    def forward(self, x, mask=None):
        for layer in self.layers:
            x = layer(x, mask)
        return x
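
One thing this encoder omits is positional information: self-attention is order-invariant, so Transformers add positional encodings to the input embeddings. Below is a minimal sketch of the sinusoidal encoding from the original Transformer paper (the class name and the max_len default are illustrative, not part of the original code above).

Python
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=512):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)  # fixed buffer, not a trainable parameter

    def forward(self, x):
        # x: (batch, seq_len, embed_dim)
        return x + self.pe[:x.size(1)]

# Usage with the encoder defined above
encoder = TransformerEncoder(num_layers=6, embed_dim=512, num_heads=8, ff_dim=2048)
pos_enc = PositionalEncoding(embed_dim=512)
x = torch.randn(2, 10, 512)
print(encoder(pos_enc(x)).shape)  # (2, 10, 512)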

1.3 Transformer Variants

Python
# BERT (bidirectional encoder)
# - Uses the Transformer encoder
# - Bidirectional attention (every token can attend to the whole sequence)
# - Pre-training tasks: MLM (Masked Language Modeling) + NSP (Next Sentence Prediction)

# GPT (generative pre-training)
# - Uses the Transformer decoder
# - Causal attention (each token can only see earlier tokens)
# - Pre-training task: autoregressive language modeling

# T5 (Text-to-Text Transfer Transformer)
# - Encoder-decoder architecture
# - Every task is cast as a text-to-text problem

# Load the different models from Hugging Face
from transformers import (
    BertModel, GPT2Model, T5Model,
    BertTokenizer, GPT2Tokenizer, T5Tokenizer
)

# BERT
bert = BertModel.from_pretrained('bert-base-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# GPT-2
gpt2 = GPT2Model.from_pretrained('gpt2')
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# T5
t5 = T5Model.from_pretrained('t5-small')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

2. Tokenizers in Depth

2.1 Tokenization Algorithms

Python
# BPE (Byte Pair Encoding)
# WordPiece
# SentencePiece
# Unigram

from transformers import (
    BertTokenizer,      # WordPiece
    GPT2Tokenizer,      # BPE
    T5Tokenizer,        # SentencePiece
    XLNetTokenizer,     # SentencePiece
    AutoTokenizer
)

# AutoTokenizer automatically picks the matching tokenizer class
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
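
A quick way to feel the difference between these algorithms is to tokenize the same word with a WordPiece model and a BPE model (a small sketch; the exact subwords depend on each model's learned vocabulary).

Python
from transformers import AutoTokenizer

bert_tok = AutoTokenizer.from_pretrained('bert-base-uncased')  # WordPiece
gpt2_tok = AutoTokenizer.from_pretrained('gpt2')               # byte-level BPE

word = "tokenization"
print(bert_tok.tokenize(word))        # WordPiece marks word-internal pieces with '##'
print(gpt2_tok.tokenize(" " + word))  # BPE; 'Ġ' marks a token that begins with a space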

2.2 Using Tokenizers

Python
from transformers import AutoTokenizer

# Load a tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

# Basic usage
text = "你好,世界!"
tokens = tokenizer.tokenize(text)
print(tokens)  # ['你', '好', ',', '世', '界', '!']

# Encode (convert to token IDs)
encoded = tokenizer.encode(text)
print(encoded)  # [101, 872, 1962, 8024, 686, 4518, 511, 102]

# Full call (returns a dict with more fields)
encoded = tokenizer(
    text,
    padding=True,           # Pad
    truncation=True,        # Truncate
    max_length=512,         # Maximum length
    return_tensors='pt'     # Return PyTorch tensors
)
print(encoded)
# {'input_ids': tensor([[...]]), 'attention_mask': tensor([[...]])}

# Batch encoding
texts = ["第一句话", "第二句话", "第三句话"]
encoded = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt'
)
print(encoded['input_ids'].shape)  # (3, seq_len)

# Decode
ids = [101, 872, 1962, 102]
decoded = tokenizer.decode(ids)
print(decoded)  # [CLS] 你好 [SEP]

# Skip special tokens
decoded = tokenizer.decode(ids, skip_special_tokens=True)
print(decoded)  # 你好

2.3 Special Tokens

Python
# Inspect the special tokens
print(tokenizer.special_tokens_map)
# {
#     'cls_token': '[CLS]',
#     'sep_token': '[SEP]',
#     'pad_token': '[PAD]',
#     'unk_token': '[UNK]',
#     'mask_token': '[MASK]'
# }

# Get the IDs of the special tokens
cls_token_id = tokenizer.cls_token_id
sep_token_id = tokenizer.sep_token_id
pad_token_id = tokenizer.pad_token_id

# Add new tokens
new_tokens = ['[NEW_TOKEN_1]', '[NEW_TOKEN_2]']
tokenizer.add_tokens(new_tokens)

# Add special tokens
special_tokens = {'cls_token': '[MY_CLS]', 'sep_token': '[MY_SEP]'}
tokenizer.add_special_tokens(special_tokens)

# Resize the model's embedding layer after adding tokens (assumes a model has already been loaded)
model.resize_token_embeddings(len(tokenizer))

2.4 Handling Sentence Pairs

Python
# Encode two sentences together (for tasks such as NSP)
sentence1 = "今天天气很好"
sentence2 = "适合出去散步"

encoded = tokenizer(
    sentence1,
    sentence2,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

# The output includes token_type_ids to distinguish the two sentences
print(encoded['input_ids'])
print(encoded['token_type_ids'])  # 0 marks the first sentence, 1 the second

# Manually build a sentence pair as plain text (note: token_type_ids are not set correctly this way)
encoded = tokenizer(
    f"{sentence1} [SEP] {sentence2}",
    padding=True,
    truncation=True,
    return_tensors='pt'
)

3. Using Models

3.1 The AutoModel Family

Python
from transformers import (
    AutoModel, AutoModelForSequenceClassification,
    AutoModelForTokenClassification, AutoModelForQuestionAnswering,
    AutoModelForCausalLM, AutoModelForSeq2SeqLM
)

# Base model (outputs hidden states only)
model = AutoModel.from_pretrained('bert-base-chinese')

# For sequence classification (adds a classification head)
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=2  # Number of classes
)

# For token-level classification (e.g. NER)
model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=9  # Number of BIO tags
)

# For question answering
model = AutoModelForQuestionAnswering.from_pretrained('bert-base-chinese')

# For text generation (causal language model)
model = AutoModelForCausalLM.from_pretrained('gpt2')

# For sequence-to-sequence tasks (e.g. translation, summarization)
model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')

3.2 Model Inference

Python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
# Note: the classification head on top of this base checkpoint is randomly initialized until fine-tuned
model = AutoModelForSequenceClassification.from_pretrained('bert-base-chinese')

# Prepare the input
text = "这是一个很好的产品"
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

# Inference
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Get the logits
logits = outputs.logits
print(logits.shape)  # (batch_size, num_labels)

# Get the predictions
predictions = torch.argmax(logits, dim=-1)
print(predictions)

# Get the probabilities
probs = torch.softmax(logits, dim=-1)
print(probs)

# Get the hidden states
outputs = model(**inputs, output_hidden_states=True)
hidden_states = outputs.hidden_states  # 13 entries (embedding layer + 12 transformer layers)
print(len(hidden_states))
print(hidden_states[-1].shape)  # (batch, seq_len, hidden_size)

# Get the attention weights
outputs = model(**inputs, output_attentions=True)
attentions = outputs.attentions  # attention weights for all 12 layers
print(len(attentions))
print(attentions[0].shape)  # (batch, num_heads, seq_len, seq_len)
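
The hidden states can also be pooled into a fixed-size sentence embedding. A small sketch using mean pooling over non-padding tokens (a common recipe, not part of the original text):

Python
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

last_hidden = outputs.hidden_states[-1]                # (batch, seq_len, hidden_size)
mask = inputs['attention_mask'].unsqueeze(-1).float()  # (batch, seq_len, 1)
sentence_embedding = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1)
print(sentence_embedding.shape)                        # (batch, hidden_size)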

3.3 Text Generation

Python
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load a generative model
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')

# Generate text
prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors='pt')

# Generate
outputs = model.generate(
    **inputs,
    max_length=100,           # Maximum total length of the generated sequence
    num_return_sequences=3,   # Return 3 candidate sequences
    temperature=0.7,          # Temperature (controls randomness)
    top_k=50,                 # Top-k sampling
    top_p=0.95,               # Nucleus (top-p) sampling
    do_sample=True,           # Sample instead of greedy decoding
    repetition_penalty=1.2,   # Repetition penalty
    pad_token_id=tokenizer.eos_token_id  # GPT-2 has no pad token, so reuse EOS
)

# Decode
generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
for i, text in enumerate(generated_texts):
    print(f"Generated {i+1}:\n{text}\n")

4. The Complete Fine-Tuning Workflow

4.1 Data Preparation

Python
from datasets import load_dataset, Dataset
import pandas as pd

# Load a dataset
dataset = load_dataset('imdb')  # sentiment analysis dataset
print(dataset)
# DatasetDict({
#     train: Dataset({
#         features: ['text', 'label'],
#         num_rows: 25000
#     })
#     test: Dataset({
#         features: ['text', 'label'],
#         num_rows: 25000
#     })
# })

# Create a dataset from a pandas DataFrame
df = pd.DataFrame({
    'text': ['很好', '不好', '一般'],
    'label': [1, 0, 2]
})
dataset = Dataset.from_pandas(df)

# Load from a local file
dataset = load_dataset('csv', data_files='data.csv')

# Preprocess the data (assumes a tokenizer has been loaded, e.g. with AutoTokenizer)
def preprocess_function(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=512
    )

encoded_dataset = dataset.map(preprocess_function, batched=True)
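
If a dataset has no predefined validation split, one can be carved out with train_test_split (a small sketch, assuming `dataset` is a single Dataset such as the one built from pandas above):

Python
split = dataset.train_test_split(test_size=0.2, seed=42)
train_ds = split['train']
eval_ds = split['test']
print(train_ds.num_rows, eval_ds.num_rows)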

4.2 Using the Trainer API

Python
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import numpy as np
from datasets import load_dataset
import evaluate

# Load the data
dataset = load_dataset('imdb')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Preprocessing
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True)

encoded_dataset = dataset.map(preprocess_function, batched=True)

# Data collator (dynamic padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    push_to_hub=False,
    logging_dir='./logs',
    logging_steps=10,
)

# Evaluation metric
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'].shuffle(seed=42).select(range(1000)),
    eval_dataset=encoded_dataset['test'].shuffle(seed=42).select(range(500)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# Evaluate
trainer.evaluate()

# Save the model
trainer.save_model('./my_model')
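
The saved model can then be reloaded for inference (a short sketch; './my_model' is the path passed to trainer.save_model above):

Python
from transformers import pipeline

finetuned = pipeline(
    'text-classification',
    model='./my_model',
    tokenizer='./my_model'  # the Trainer saves the tokenizer alongside when one is passed in
)
print(finetuned("An absolutely wonderful film."))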

4.3 Custom Training Loop

Python
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

# Prepare the data: drop the raw text column so the collator only sees tensor-able fields
encoded_dataset = encoded_dataset.remove_columns(['text'])

train_dataloader = DataLoader(
    encoded_dataset['train'],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator  # DataCollatorWithPadding also renames 'label' to 'labels'
)

# Optimizer and learning-rate schedule
optimizer = AdamW(model.parameters(), lr=2e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}')

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})

    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}')
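
To pair with the training loop, a minimal evaluation-loop sketch (assumes an eval DataLoader built the same way as train_dataloader; DataCollatorWithPadding renames 'label' to 'labels'):

Python
eval_dataloader = DataLoader(
    encoded_dataset['test'],
    batch_size=16,
    collate_fn=data_collator
)

model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        preds = model(**batch).logits.argmax(dim=-1)
        correct += (preds == batch['labels']).sum().item()
        total += preds.size(0)

print(f'Accuracy: {correct / total:.4f}')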

5. Different NLP Tasks

5.1 Text Classification

Python
from transformers import pipeline

# Quick implementation with a pipeline
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english'
)

result = classifier("I love this product!")
print(result)
# [{'label': 'POSITIVE', 'score': 0.9998}]

# Zero-shot classification (classify against arbitrary candidate labels)
classifier = pipeline(
    'zero-shot-classification',
    model='facebook/bart-large-mnli'
)

result = classifier(
    "I have a problem with my iphone that needs to be resolved asap!",
    candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"]
)
print(result)

5.2 Named Entity Recognition (NER)

Python
# Using a pipeline
ner_pipeline = pipeline(
    'ner',
    model='dslim/bert-base-NER',
    aggregation_strategy='simple'
)

text = "Apple is looking at buying U.K. startup for $1 billion"
entities = ner_pipeline(text)
for entity in entities:
    print(f"{entity['word']}: {entity['entity_group']} ({entity['score']:.4f})")

# Fine-tune an NER model
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels=9  # BIO tags: O, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC, B-MISC, I-MISC
)
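
When fine-tuning, it also helps to attach the label names to the config so that predictions decode to readable tags (a hedged sketch; the tag order below is just one common convention):

Python
labels = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels=len(labels),
    id2label={i: label for i, label in enumerate(labels)},
    label2id={label: i for i, label in enumerate(labels)},
)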

5.3 Question Answering

Python
# Extractive QA
qa_pipeline = pipeline(
    'question-answering',
    model='distilbert-base-cased-distilled-squad'
)

context = """
The Transformers library provides state-of-the-art general-purpose architectures
for Natural Language Understanding (NLU) and Natural Language Generation (NLG).
"""
question = "What does the Transformers library provide?"

result = qa_pipeline(question=question, context=context)
print(result)
# {'score': 0.99, 'start': 34, 'end': 95, 'answer': 'state-of-the-art general-purpose architectures'}

# Generative (closed-book) QA with T5
generator = pipeline('text2text-generation', model='google/t5-small-ssm-nq')
result = generator("What is the capital of France?")  # the model answers from its own parameters
print(result)

5.4 Text Summarization

Python
# Using a pipeline
summarizer = pipeline('summarization', model='facebook/bart-large-cnn')

text = """
Your long text here... (at least 100 words)
"""

summary = summarizer(
    text,
    max_length=130,
    min_length=30,
    do_sample=False
)
print(summary[0]['summary_text'])

# Using T5 (prefix the input with the task instruction)
t5_summarizer = pipeline(
    'text2text-generation',
    model='t5-small'
)

result = t5_summarizer(
    "summarize: " + text,
    max_length=100,
    num_return_sequences=1
)
print(result[0]['generated_text'])

5.5 Machine Translation

Python
# Using a pipeline
translator = pipeline(
    'translation_en_to_de',
    model='t5-small'
)

result = translator("Hello, how are you?")
print(result)

# Using MarianMT
from transformers import MarianMTModel, MarianTokenizer

model_name = 'Helsinki-NLP/opus-mt-en-de'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors='pt', padding=True)
translated = model.generate(**inputs)
result = tokenizer.decode(translated[0], skip_special_tokens=True)
print(result)

5.6 Text Generation

Python
# Generation with GPT-2
generator = pipeline('text-generation', model='gpt2')

prompt = "The future of AI is"
result = generator(
    prompt,
    max_length=100,
    num_return_sequences=3,
    temperature=0.8
)

for i, res in enumerate(result):
    print(f"Generation {i+1}:\n{res['generated_text']}\n")

6. Model Optimization and Deployment

6.1 Model Quantization

Python
from transformers import AutoModelForSequenceClassification
import torch

# Dynamic quantization
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},
    dtype=torch.qint8
)

# Save the quantized model
torch.save(quantized_model.state_dict(), 'quantized_model.pt')

# Optimizing with Optimum (ONNX Runtime backend)
from optimum.onnxruntime import ORTModelForSequenceClassification

model = ORTModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    export=True
)
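
A quick sanity check (sketch): on CPU, the dynamically quantized model from above remains a drop-in replacement for the original.

Python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('bert-base-uncased')
inputs = tok("Dynamic quantization keeps the same interface.", return_tensors='pt')
with torch.no_grad():
    print(quantized_model(**inputs).logits.shape)  # (1, 2)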

6.2 ONNX Export

Python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')

# Prepare an example input
text = "This is a sample text"
inputs = tokenizer(text, return_tensors='pt')

# Export to ONNX
torch.onnx.export(
    model,
    (inputs['input_ids'], inputs['attention_mask']),
    'model.onnx',
    input_names=['input_ids', 'attention_mask'],
    output_names=['output'],
    dynamic_axes={
        'input_ids': {0: 'batch_size', 1: 'sequence'},
        'attention_mask': {0: 'batch_size', 1: 'sequence'},
        'output': {0: 'batch_size'}
    },
    opset_version=11
)

6.3 Inference Optimization

Python
# Using TorchScript
from transformers import AutoModel

# torchscript=True makes the model return tuples instead of ModelOutput objects, which torch.jit.trace requires
model = AutoModel.from_pretrained('bert-base-uncased', torchscript=True)
model.eval()

# Trace the model
example_inputs = tokenizer("Example text", return_tensors='pt')
traced_model = torch.jit.trace(
    model,
    (example_inputs['input_ids'], example_inputs['attention_mask'])
)

# Save
traced_model.save('traced_model.pt')

# Using ONNX Runtime
import onnxruntime as ort

session = ort.InferenceSession('model.onnx')
inputs = {
    'input_ids': inputs['input_ids'].numpy(),
    'attention_mask': inputs['attention_mask'].numpy()
}
outputs = session.run(None, inputs)

6.4 Deploying as an API

Python
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline

app = FastAPI()

# Load the model
classifier = pipeline('sentiment-analysis')

class TextInput(BaseModel):
    text: str

class PredictionOutput(BaseModel):
    label: str
    score: float

@app.post('/predict', response_model=PredictionOutput)
def predict(input: TextInput):
    result = classifier(input.text)[0]
    return PredictionOutput(label=result['label'], score=result['score'])

@app.get('/health')
def health():
    return {'status': 'healthy'}

# Run with: uvicorn api:app --reload
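
A hypothetical client call against this API (assumes the server is running locally on port 8000):

Python
import requests

resp = requests.post(
    'http://localhost:8000/predict',
    json={'text': 'I love this product!'}
)
print(resp.json())  # e.g. {'label': 'POSITIVE', 'score': 0.99}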

📝 Exercises

Exercise 1: Tokenizer Practice

Python
# 1. Load the tokenizer of a Chinese BERT model
# 2. Tokenize a passage of Chinese text
# 3. Inspect the special tokens and their IDs
# 4. Add custom tokens and resize the model's embeddings

Exercise 2: Model Inference

Python
# 1. Load a pretrained text-classification model
# 2. Run batch inference on several texts
# 3. Get the prediction probabilities and hidden states
# 4. Visualize the attention weights

Exercise 3: Model Fine-Tuning

Python
# 1. Pick a text-classification dataset
# 2. Fine-tune a BERT model with the Trainer API
# 3. Evaluate the model's performance
# 4. Save and reload the fine-tuned model

Exercise 4: A Complete NLP Project

Python
# 1. Pick an NLP task (e.g. sentiment analysis, NER)
# 2. Prepare a dataset
# 3. Fine-tune a pretrained model
# 4. Evaluate and optimize
# 5. Deploy it as an API service

🎯 Self-Check

  • Understand the Transformer architecture and the self-attention mechanism
  • Know how to use tokenizers
  • Can load and use pretrained models
  • Know the fine-tuning workflow
  • Can implement different NLP tasks
  • Understand model optimization and deployment methods

Next: 04 - Hands-On Projects