|
|
@@ -0,0 +1,1027 @@
|
|
|
+{
|
|
|
+ "source": {
|
|
|
+ "platform": "xhs",
|
|
|
+ "author": "xhs博主",
|
|
|
+ "date": "2026-04-27",
|
|
|
+ "url": "https://www.xiaohongshu.com/explore/69ee467400000000220289e2",
|
|
|
+ "title": "大模型原理图解,从Prompt到第一个Token",
|
|
|
+ "excerpt": "用大白话图解LLM推理全流程,从用户Prompt文本到生成第一个Token,拆解为预处理→Embedding→Transformer前向→LM Head→解码决策五阶段,配9张详细图解。"
|
|
|
+ },
|
|
|
+ "procedures": [
|
|
|
+ {
|
|
|
+ "id": "p1",
|
|
|
+ "name": "LLM推理_从Prompt到第一个Token",
|
|
|
+ "purpose": "给定用户Prompt,经分词→Embedding→Transformer多层前向→LM Head→解码采样,生成第一个输出Token",
|
|
|
+ "category": "学习",
|
|
|
+ "platform": "xhs",
|
|
|
+ "author": "xhs博主",
|
|
|
+ "declarations": {
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "提示词",
|
|
|
+ "name": "用户Prompt",
|
|
|
+ "desc": "用户输入的自然语言指令文本"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "resources": [
|
|
|
+ {
|
|
|
+ "type": "模型权重",
|
|
|
+ "name": "LLM模型参数",
|
|
|
+ "desc": "Transformer各层权重矩阵(Wq/Wk/Wv/Wo/MLP权重)+W_vocab+可学习位置嵌入参数"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "type": "词表",
|
|
|
+ "name": "Tokenizer词表",
|
|
|
+ "desc": "Token字符串到整数ID的双向映射,共vocab_size条"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "returns": {
|
|
|
+ "type": "输出Token",
|
|
|
+ "note": "第一个生成Token的字符/子词形式,可继续自回归循环"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "type_registry": {
|
|
|
+ "Token序列": {
|
|
|
+ "extends": "描述",
|
|
|
+ "desc": "Tokenizer将自然语言文本切分后得到的字符串token列表,如['今天','天气','很','好'],是文本在词表粒度上的离散序列表示"
|
|
|
+ },
|
|
|
+ "Token_ID序列": {
|
|
|
+ "extends": "描述",
|
|
|
+ "desc": "Token序列中每个token在词表中对应的整数索引列表,形状[n,],值域[0, vocab_size-1],是模型实际消费的数字化输入"
|
|
|
+ },
|
|
|
+ "注意力掩码": {
|
|
|
+ "extends": "蒙版",
|
|
|
+ "desc": "与padded token_ids同形状的二元矩阵,有效token位置=1,padding位置=0,用于在注意力计算中屏蔽padding位置"
|
|
|
+ },
|
|
|
+ "嵌入矩阵": {
|
|
|
+ "extends": "模型权重",
|
|
|
+ "desc": "词嵌入查表后得到的token向量矩阵,形状[n, d_model],每行是对应token在连续语义空间中的浮点表示"
|
|
|
+ },
|
|
|
+ "隐状态矩阵": {
|
|
|
+ "extends": "底图",
|
|
|
+ "desc": "Transformer各层输入/输出的中间表征矩阵,形状[n, d_model],承载当前层对序列的语义理解,是后续层持续改造的对象"
|
|
|
+ },
|
|
|
+ "QKV矩阵组": {
|
|
|
+ "extends": "底图",
|
|
|
+ "desc": "注意力机制中Query、Key、Value三个线性投影矩阵的组合,各形状[n, d_model],分别负责查询匹配、键索引和值聚合"
|
|
|
+ },
|
|
|
+ "多头QKV": {
|
|
|
+ "extends": "底图",
|
|
|
+ "desc": "将QKV矩阵reshape为多头形式后的张量,形状[n, n_heads, d_head],支持多头并行注意力计算"
|
|
|
+ },
|
|
|
+ "注意力分数矩阵": {
|
|
|
+ "extends": "底图",
|
|
|
+ "desc": "QK点积经缩放和因果掩码处理后的注意力分数,形状[n_heads, n, n],未来位置被置为负无穷"
|
|
|
+ },
|
|
|
+ "注意力输出矩阵": {
|
|
|
+ "extends": "底图",
|
|
|
+ "desc": "经softmax归一化后对V做加权求和得到的注意力输出,形状[n, n_heads, d_head],是多头注意力的直接产出"
|
|
|
+ },
|
|
|
+ "位置向量": {
|
|
|
+ "extends": "底图",
|
|
|
+ "desc": "最终隐状态矩阵最后一行XL[n-1],形状[d_model,],代表模型处理完整prompt后的上下文压缩表征"
|
|
|
+ },
|
|
|
+ "归一化向量": {
|
|
|
+ "extends": "底图",
|
|
|
+ "desc": "经Final LayerNorm处理后的最终特征向量,形状[d_model,],数值尺度对齐到LM Head期望输入范围"
|
|
|
+ },
|
|
|
+ "logits向量": {
|
|
|
+ "extends": "底图",
|
|
|
+ "desc": "LM Head对最终表征线性投影得到的未归一化预测分数,形状[vocab_size,],每个位置对应词表中一个token的预测得分"
|
|
|
+ },
|
|
|
+ "KV缓存": {
|
|
|
+ "extends": "生成参数",
|
|
|
+ "desc": "prefill阶段按层和注意力头缓存的K/V矩阵,每层形状[2, n_heads, n, d_head],供后续自回归decode阶段复用以避免重复计算"
|
|
|
+ },
|
|
|
+ "解码配置": {
|
|
|
+ "extends": "生成参数",
|
|
|
+ "desc": "控制解码行为的参数集合,包含temperature缩放系数、logits_bias词级偏置、repetition_penalty重复惩罚系数和bad_words_ids屏蔽列表"
|
|
|
+ },
|
|
|
+ "候选Token列表": {
|
|
|
+ "extends": "样图",
|
|
|
+ "desc": "经Top-k/Top-p截断后保留的候选token子集,形状仍[vocab_size,]但非候选位置logits被置为负无穷,等待最终采样决策"
|
|
|
+ },
|
|
|
+ "输出Token": {
|
|
|
+ "extends": "正文",
|
|
|
+ "desc": "经采样或argmax决策后产出的第一个生成token,对应词表中某个整数ID及其解码后的子词或字符,是本次推理的最终交付物"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "steps": [
|
|
|
+ {
|
|
|
+ "id": "s1",
|
|
|
+ "kind": "step",
|
|
|
+ "via": "<tokenizer>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "将Prompt文本按Tokenizer规则切分为token字符串序列"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "tokens=[t0,t1,...,tn-1];切分规则由模型/词表决定(BPE/WordPiece等);类比把一段话拆成一块块可拼装的乐高零件"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "提示词",
|
|
|
+ "name": "用户Prompt",
|
|
|
+ "value": "详细图解LLM推理流程,从Prompt到第一个Token(用户输入的完整自然语言文本,如问题、指令等)",
|
|
|
+ "anchor": "← 工序输入",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/AI应用",
|
|
|
+ "form": "/架构/叙事/叙事体裁/演说交流/科普教学"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "Token序列",
|
|
|
+ "name": "tokens",
|
|
|
+ "value": "<Prompt文本切分后的字符串token列表,如['今天','天气','很','好'],长度n取决于Prompt和词表粒度>",
|
|
|
+ "anchor": "→ s2",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": "/架构/逻辑/信息编排逻辑/条目列举/逐条罗列/序号列举"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "用 {via:<tokenizer>} 把 {in-type:提示词} 切分为 {out-type:Token序列}",
|
|
|
+ "focus": [
|
|
|
+ "via",
|
|
|
+ "in-type-0",
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "预处理",
|
|
|
+ "action": "提取/化学提取/识别"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s2",
|
|
|
+ "kind": "step",
|
|
|
+ "via": "<tokenizer>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "在Tokenizer词表中查找每个token对应的整数ID"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "token_ids=[id0,id1,...,idn-1];目的:把符号映射成整数索引,便于embedding table以O(1)查表;类比把单词换成字典里的页码/编号"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "Token序列",
|
|
|
+ "name": "tokens",
|
|
|
+ "value": "<Prompt文本切分后的字符串token列表,如['今天','天气','很','好'],长度n取决于Prompt和词表粒度>",
|
|
|
+ "anchor": "← s1.tokens",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": "/架构/逻辑/信息编排逻辑/条目列举/逐条罗列/序号列举"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "Token_ID序列",
|
|
|
+ "name": "token_ids",
|
|
|
+ "value": "<tokens对应的整数ID列表,如[1234,5678,90,123],形状[n,],值域[0,vocab_size-1]>",
|
|
|
+ "anchor": "→ s3 / s4",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": "/架构/逻辑/信息编排逻辑/条目列举/逐条罗列/序号列举"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "词表查询把 {in-type:Token序列} 映射为 {out-type:Token_ID序列}",
|
|
|
+ "focus": [
|
|
|
+ "in-type-0",
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "预处理",
|
|
|
+ "action": "修改/变/转换"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s3",
|
|
|
+ "kind": "step",
|
|
|
+ "via": "<tokenizer>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "对batch内多条序列进行padding对齐,生成attention_mask标记有效位置"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "可选步骤:单序列推理时可省略;用途:并行计算多个样本+防止padding位置参与注意力计算;padding位置填特殊PAD_ID;类比把不同长度的积木墙装进同样大小的盒子,空余处用占位块填满,但占位块不影响计算结果"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "Token_ID序列",
|
|
|
+ "name": "token_ids",
|
|
|
+ "value": "<tokens对应的整数ID列表,如[1234,5678,90,123],形状[n,],值域[0,vocab_size-1]>",
|
|
|
+ "anchor": "← s2.token_ids",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "Token_ID序列",
|
|
|
+ "name": "padded_token_ids",
|
|
|
+ "value": "<padding后的token_ids,形状[batch_size,max_seq_len],PAD位置填充特殊PAD_ID>",
|
|
|
+ "anchor": "→ s4",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "type": "注意力掩码",
|
|
|
+ "name": "attention_mask",
|
|
|
+ "value": "<与padded_token_ids同形状的二元mask,有效token=1,padding=0;用于标记哪些位置参与注意力计算>",
|
|
|
+ "anchor": "→ s6.4",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "批处理填充对齐 {in-type:Token_ID序列} 生成 {out-type:注意力掩码}",
|
|
|
+ "focus": [
|
|
|
+ "out-type-1"
|
|
|
+ ],
|
|
|
+ "effect": "预处理",
|
|
|
+ "action": "生成/元素生成"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s4",
|
|
|
+ "kind": "step",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "查embedding table,将token_ids映射到连续向量空间X矩阵"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "X=Embedding(token_ids);形状[n,d_model];目的:把离散ID映射到连续向量空间;embedding权重是模型可学习参数;类比把编号变成坐标,在地图上更易处理"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "Token_ID序列",
|
|
|
+ "name": "token_ids",
|
|
|
+ "value": "<tokens对应的整数ID列表,如[1234,5678,90,123],形状[n,],值域[0,vocab_size-1]>",
|
|
|
+ "anchor": "← s2.token_ids",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "嵌入矩阵",
|
|
|
+ "name": "X",
|
|
|
+ "value": "<每个token_id对应的embedding向量堆叠,形状[n,d_model](如d_model=4096);每行是该token在语义空间的连续浮点表示>",
|
|
|
+ "anchor": "→ s5",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "查表把 {in-type:Token_ID序列} 映射为连续 {out-type:嵌入矩阵}",
|
|
|
+ "focus": [
|
|
|
+ "in-type-0",
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "预处理",
|
|
|
+ "action": "提取/化学提取/反推"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s5",
|
|
|
+ "kind": "step",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "方式A(绝对/可学习位置嵌入):X = X + PosEmbed(positions),将位置向量叠加到token embedding上"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "方式A:X=X+PosEmbed(positions),位置嵌入是可学习参数或正弦编码;方式B(RoPE/ALiBi):不在此处处理,在每层Attention子层内部对Q/K施加位置相关旋转变换(见s6.3);两方案区别:绝对位置告诉模型你在第几位,相对位置告诉模型你们相距多远"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "嵌入矩阵",
|
|
|
+ "name": "X",
|
|
|
+ "value": "<每个token_id对应的embedding向量堆叠,形状[n,d_model](如d_model=4096);每行是该token在语义空间的连续浮点表示>",
|
|
|
+ "anchor": "← s4.X",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "隐状态矩阵",
|
|
|
+ "name": "X",
|
|
|
+ "value": "<加入位置信息后的输入矩阵,形状[n,d_model];若方式A则X+=PosEmbed;若方式B(RoPE)则X不变,位置编码在s6.3对Q/K旋转注入>",
|
|
|
+ "anchor": "→ s6",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "注入位置信息到 {in-type:嵌入矩阵} 得带位置的 {out-type:隐状态矩阵}",
|
|
|
+ "focus": [
|
|
|
+ "in-type-0",
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "预处理",
|
|
|
+ "action": "修改/增/添加"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s6",
|
|
|
+ "kind": "block",
|
|
|
+ "control": "遍历",
|
|
|
+ "via": "-",
|
|
|
+ "feature": "-",
|
|
|
+ "intent": "{control:遍历} L个Transformer层做前向计算,逐层更新隐状态建立跨位置依赖",
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "隐状态矩阵",
|
|
|
+ "name": "X",
|
|
|
+ "value": "<加入位置信息后的输入矩阵,形状[n,d_model];若方式A则X+=PosEmbed;若方式B(RoPE)则X不变,位置编码在s6.3对Q/K旋转注入>",
|
|
|
+ "anchor": "← s5.X",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "隐状态矩阵",
|
|
|
+ "name": "XL",
|
|
|
+ "value": "<经L层Transformer前向计算后的最终隐状态,形状[n,d_model];XL[n-1]代表读完整段prompt后的当前上下文状态>",
|
|
|
+ "anchor": "→ s7",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "effect": "主体生成",
|
|
|
+ "action": "生成/元素生成"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s6.1",
|
|
|
+ "kind": "nested",
|
|
|
+ "group": "s6",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "Pre-Norm:H=Norm(X)(RMSNorm/LayerNorm);线性投影:Q=H*Wq, K=H*Wk, V=H*Wv"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "Pre-Norm稳定数值尺度,降低激活异常风险;Q/K/V把同一隐状态映射到查询/键/值三个子空间;类比名片(Q)/目录索引(K)/具体内容(V)三件事分开做"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "隐状态矩阵",
|
|
|
+ "name": "X",
|
|
|
+ "value": "<当前层l的输入隐状态,形状[n,d_model]>",
|
|
|
+ "anchor": "← s6.X[l]",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "QKV矩阵组",
|
|
|
+ "name": "Q_K_V",
|
|
|
+ "value": "<Q/K/V各形状[n,d_model];H分别右乘Wq/Wk/Wv线性变换得到;Q负责查询匹配,K负责键索引,V负责值聚合>",
|
|
|
+ "anchor": "→ s6.2",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "Pre-Norm后线性投影 {in-type:隐状态矩阵} 得 {out-type:QKV矩阵组}",
|
|
|
+ "focus": [
|
|
|
+ "in-type-0",
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "预处理",
|
|
|
+ "action": "提取/化学提取/反推"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s6.2",
|
|
|
+ "kind": "nested",
|
|
|
+ "group": "s6",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "将Q/K/V从[n,d_model]reshape为[n,n_heads,d_head],拆分为多个注意力头"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "d_head=d_model/n_heads;目的:让不同head学到不同关系子空间,并行建模多种依赖;类比同一段话同时从语法、指代、主题等多个角度去读"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "QKV矩阵组",
|
|
|
+ "name": "Q_K_V",
|
|
|
+ "value": "<Q/K/V各形状[n,d_model];H分别右乘Wq/Wk/Wv线性变换得到;Q负责查询匹配,K负责键索引,V负责值聚合>",
|
|
|
+ "anchor": "← s6.1.Q_K_V",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "多头QKV",
|
|
|
+ "name": "Q_K_V_heads",
|
|
|
+ "value": "<Q/K/V各reshape为[n,n_heads,d_head];Head 1关注语法关系,Head 2关注指代,Head h关注主题等,并行计算>",
|
|
|
+ "anchor": "→ s6.3",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "reshape {in-type:QKV矩阵组} 为 {out-type:多头QKV} 准备并行注意力",
|
|
|
+ "focus": [
|
|
|
+ "in-type-0",
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "预处理",
|
|
|
+ "action": "修改/变/转换"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s6.3",
|
|
|
+ "kind": "nested",
|
|
|
+ "group": "s6",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "若使用RoPE:对每个head的Q/K应用旋转位置编码(旋转角度由各token自身的位置索引决定,使Q·K点积只依赖两者的相对位置),V不做位置编码"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "RoPE目的:以相对关系形式将位置信息注入注意力匹配,保持相对距离语义;类比把相对距离编码进匹配规则里;若已在s5使用绝对位置嵌入,此步可跳过;ALiBi是另一种方案:在attention score上直接加偏置"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "多头QKV",
|
|
|
+ "name": "Q_K_V_heads",
|
|
|
+ "value": "<Q/K/V各reshape为[n,n_heads,d_head];Head 1关注语法关系,Head 2关注指代,Head h关注主题等,并行计算>",
|
|
|
+ "anchor": "← s6.2.Q_K_V_heads",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "多头QKV",
|
|
|
+ "name": "Q_K_V_pos",
|
|
|
+ "value": "<Q'/K'已施加RoPE旋转携带相对位置信息,V不变;可进行位置感知的注意力计算>",
|
|
|
+ "anchor": "→ s6.4",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "对 {in-type:多头QKV} 的Q/K施加RoPE旋转注入相对位置信息",
|
|
|
+ "focus": [
|
|
|
+ "in-type-0",
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "预处理",
|
|
|
+ "action": "修改/变/调整"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s6.4",
|
|
|
+ "kind": "nested",
|
|
|
+ "group": "s6",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "计算注意力分数S=QK^T/sqrt(d_head);施加因果mask(Causal Mask)将未来位置设为-∞,并结合padding mask"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "Causal Mask确保当前位置不能看未来信息(自回归语义正确性);-∞经softmax后→0;padding mask防止填充位置参与计算;类比读书时把后面的内容遮住"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "多头QKV",
|
|
|
+ "name": "Q_K_V_pos",
|
|
|
+ "value": "<Q'/K'已施加RoPE旋转携带相对位置信息,V不变;可进行位置感知的注意力计算>",
|
|
|
+ "anchor": "← s6.3.Q_K_V_pos",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "type": "注意力掩码",
|
|
|
+ "name": "attention_mask",
|
|
|
+ "value": "<与padded_token_ids同形状的二元mask,有效token=1,padding=0;用于标记哪些位置参与注意力计算>",
|
|
|
+ "anchor": "← s3.attention_mask",
|
|
|
+ "inferred": true,
|
|
|
+ "inferred_reason": "Causal Mask+Padding Mask组合施加,padding_mask来自s3的attention_mask;原文图5明确展示两者结合",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "注意力分数矩阵",
|
|
|
+ "name": "S_masked",
|
|
|
+ "value": "<形状[n_heads,n,n];未来位置=-∞(mask后),历史/当前位置=有效QK点积分数/sqrt(d_head)>",
|
|
|
+ "anchor": "→ s6.5",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "点积计算并施加Causal Mask得 {out-type:注意力分数矩阵}",
|
|
|
+ "focus": [
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "预处理",
|
|
|
+ "action": "生成/元素生成"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s6.5",
|
|
|
+ "kind": "nested",
|
|
|
+ "group": "s6",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "对S_masked最后一维做softmax归一化得注意力权重A=softmax(S);加权求和O=AV"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "Softmax把分数归一化为概率分布(和=1),便于加权求和;O=AV,每个位置的输出是所有历史V的注意力加权平均;类比把多个小组意见按权重相加得一份摘要"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "注意力分数矩阵",
|
|
|
+ "name": "S_masked",
|
|
|
+ "value": "<形状[n_heads,n,n];未来位置=-∞(mask后),历史/当前位置=有效QK点积分数/sqrt(d_head)>",
|
|
|
+ "anchor": "← s6.4.S_masked",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "注意力输出矩阵",
|
|
|
+ "name": "O_heads",
|
|
|
+ "value": "<形状[n,n_heads,d_head];每个位置的O是历史V向量的注意力加权平均;A=softmax(S),O=AV>",
|
|
|
+ "anchor": "→ s6.6",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "Softmax归一化后加权聚合V得 {out-type:注意力输出矩阵}",
|
|
|
+ "focus": [
|
|
|
+ "in-type-0",
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "主体生成",
|
|
|
+ "action": "生成/元素生成"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s6.6",
|
|
|
+ "kind": "nested",
|
|
|
+ "group": "s6",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "输出投影:O=concat(all_heads)*Wo,将多头结果投影回d_model;残差连接1:X'=X+O"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "输出投影concat各head后经Wo线性变换回d_model;残差连接让深层网络更易梯度传播;类比在改稿时保留原稿主线再叠加补充内容"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "注意力输出矩阵",
|
|
|
+ "name": "O_heads",
|
|
|
+ "value": "<形状[n,n_heads,d_head];每个位置的O是历史V向量的注意力加权平均;A=softmax(S),O=AV>",
|
|
|
+ "anchor": "← s6.5.O_heads",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "type": "隐状态矩阵",
|
|
|
+ "name": "X",
|
|
|
+ "value": "<当前层l的输入隐状态,形状[n,d_model]>",
|
|
|
+ "anchor": "← s6.1.X",
|
|
|
+ "inferred": true,
|
|
|
+ "inferred_reason": "残差连接X'=X+O需要当前层输入X,是Transformer标准架构必经步骤",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "隐状态矩阵",
|
|
|
+ "name": "X_prime",
|
|
|
+ "value": "<Attention子层完成后的中间隐状态,形状[n,d_model];X'=X+O*Wo,已融合多头注意力信息>",
|
|
|
+ "anchor": "→ s6.7",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "输出投影+残差连接完成Attention子层输出 {out-type:隐状态矩阵}",
|
|
|
+ "focus": [
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "主体生成",
|
|
|
+ "action": "修改/增/添加"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s6.7",
|
|
|
+ "kind": "nested",
|
|
|
+ "group": "s6",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "FFN/MLP子层:M=MLP(Norm(X')),两层线性+激活(SwiGLU/GEGLU/GELU);残差连接2:X=X'+M"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "MLP:Norm(X')→Linear(d_model→4d_model)→激活→Linear(4d_model→d_model);提供每个token位置的非线性变换与通道混合能力,补足注意力的表达能力;类比对每个位置做一次本地加工/润色"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "隐状态矩阵",
|
|
|
+ "name": "X_prime",
|
|
|
+ "value": "<Attention子层完成后的中间隐状态,形状[n,d_model];X'=X+O*Wo,已融合多头注意力信息>",
|
|
|
+ "anchor": "← s6.6.X_prime",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "隐状态矩阵",
|
|
|
+ "name": "X_updated",
|
|
|
+ "value": "<当前Transformer层最终输出,形状[n,d_model];X=X'+M,经Attention+MLP两个子层处理,传入下一层>",
|
|
|
+ "anchor": "→ s6[i+1] / → s6.8",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "MLP/FFN非线性变换+残差连接得更新后的 {out-type:隐状态矩阵}",
|
|
|
+ "focus": [
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "主体生成",
|
|
|
+ "action": "修改/增/添加"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s6.8",
|
|
|
+ "kind": "nested",
|
|
|
+ "group": "s6",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "写外部",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "将当前层l的K和V矩阵(按head)写入KV Cache存储"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "KV Cache作用:后续每生成新token时不必重新计算整段prompt的K/V,解码复杂度从O(T²)降至近似O(T);存储结构:每层[2,n_heads,seq_len,d_head];类比做题时把已读材料做成笔记后面不必从头翻"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "QKV矩阵组",
|
|
|
+ "name": "K_V_current",
|
|
|
+ "value": "<当前层l的K和V矩阵,各形状[n,n_heads,d_head],将被缓存供后续decode阶段复用>",
|
|
|
+ "anchor": "← s6.1.Q_K_V",
|
|
|
+ "inferred": true,
|
|
|
+ "inferred_reason": "KV Cache写入需要当前层K/V,来自s6.1的QKV投影;原文图7明确展示此步骤",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "KV缓存",
|
|
|
+ "name": "kv_cache",
|
|
|
+ "value": "<按层和head组织的K/V缓存;prefill阶段写满所有n个位置;每层形状[2,n_heads,n,d_head](2=K+V);供后续续token生成的decode阶段读取>",
|
|
|
+ "anchor": "→ 返回 KV缓存",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "将当前层K/V写入 {out-type:KV缓存} 供decode阶段复用",
|
|
|
+ "focus": [
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "预处理",
|
|
|
+ "action": "存储/暂存/缓存"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s7",
|
|
|
+ "kind": "step",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "从最终隐状态XL中取最后一个序列位置:x_last=XL[n-1]"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "Prefill阶段对prompt每个位置都有隐状态输出,但只需最后位置x_last生成第一个token;x_last代表读完整段prompt后的当前上下文压缩表示"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "隐状态矩阵",
|
|
|
+ "name": "XL",
|
|
|
+ "value": "<经L层Transformer前向计算后的最终隐状态,形状[n,d_model];XL[n-1]代表读完整段prompt后的当前上下文状态>",
|
|
|
+ "anchor": "← s6.XL",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "位置向量",
|
|
|
+ "name": "x_last",
|
|
|
+ "value": "<XL最后一行XL[n-1],形状[d_model,];代表模型读完整段prompt后的当前上下文压缩状态>",
|
|
|
+ "anchor": "→ s8",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "取 {in-type:隐状态矩阵} 最后位置得 {out-type:位置向量} x_last",
|
|
|
+ "focus": [
|
|
|
+ "in-type-0",
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "预处理",
|
|
|
+ "action": "提取/物理提取/裁切"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s8",
|
|
|
+ "kind": "step",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "Final Norm:h=FinalNorm(x_last),常见为RMSNorm/LayerNorm"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "目的:把最终表征对齐到LM Head期望的数值尺度;类比出门前把指南针校准;取决于具体模型结构是否有FinalNorm(部分模型有,部分没有)"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "位置向量",
|
|
|
+ "name": "x_last",
|
|
|
+ "value": "<XL最后一行XL[n-1],形状[d_model,];代表模型读完整段prompt后的当前上下文压缩状态>",
|
|
|
+ "anchor": "← s7.x_last",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "归一化向量",
|
|
|
+ "name": "h",
|
|
|
+ "value": "<经FinalNorm处理后的最终表征,形状[d_model,];数值尺度对齐到LM Head期望范围>",
|
|
|
+ "anchor": "→ s9",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "FinalNorm归一化 {in-type:位置向量} 得对齐尺度的 {out-type:归一化向量} h",
|
|
|
+ "focus": [
|
|
|
+ "in-type-0",
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "预处理",
|
|
|
+ "action": "修改/变/增强"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s9",
|
|
|
+ "kind": "step",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "LM Head矩阵乘法:logits=h*W_vocab,W_vocab形状[d_model,vocab_size]"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "logits是未归一化原始分数,形状[vocab_size,];代表第一个输出token的候选得分,覆盖整个词表;还未经softmax不是概率;类比把当前想法翻译成一张候选词得分榜"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "归一化向量",
|
|
|
+ "name": "h",
|
|
|
+ "value": "<经FinalNorm处理后的最终表征,形状[d_model,];数值尺度对齐到LM Head期望范围>",
|
|
|
+ "anchor": "← s8.h",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "logits向量",
|
|
|
+ "name": "logits",
|
|
|
+ "value": "<形状[vocab_size,]的未归一化原始分数,如[3.2,2.8,1.5,...,-0.3];每个位置对应词表一个token的预测得分;还未经softmax不是概率>",
|
|
|
+ "anchor": "→ s10",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "LM Head矩阵乘法得覆盖全词表的 {out-type:logits向量}",
|
|
|
+ "focus": [
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "主体生成",
|
|
|
+ "action": "生成/元素生成"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s10",
|
|
|
+ "kind": "step",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "可选:Temperature缩放(logits/T)+Logits bias(特定token加减分)+重复惩罚(降低已出现token分数)+Bad words mask(直接屏蔽某些token)"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "可选步骤:目的是把基础模型偏好对齐到产品目标与可控性要求;T越低越确定,T越高越发散;Logits bias用于安全/风格控制;类比在出答案前调一调说话风格/谨慎程度旋钮"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "logits向量",
|
|
|
+ "name": "logits",
|
|
|
+ "value": "<形状[vocab_size,]的未归一化原始分数,如[3.2,2.8,1.5,...,-0.3];每个位置对应词表一个token的预测得分;还未经softmax不是概率>",
|
|
|
+ "anchor": "← s9.logits",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "type": "解码配置",
|
|
|
+ "name": "decoding_params",
|
|
|
+ "value": "<解码参数配置:temperature(如1.0),logits_bias(token级加减分词典),repetition_penalty(重复惩罚系数),bad_words_ids(屏蔽token列表)>",
|
|
|
+ "anchor": "← 工序输入",
|
|
|
+ "inferred": true,
|
|
|
+ "inferred_reason": "logits处理需要解码参数,原文正文Step3明确提到加载生成参数(温度、Top-P采样等)",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/AI应用",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "logits向量",
|
|
|
+ "name": "adjusted_logits",
|
|
|
+ "value": "<经Temperature缩放+偏置+惩罚+屏蔽处理后的logits,形状仍[vocab_size,];不期望token得分被调整或置为-∞>",
|
|
|
+ "anchor": "→ s11",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "用 {in-type:解码配置} 对 {in-type:logits向量} 做Temperature等调整",
|
|
|
+ "focus": [
|
|
|
+ "in-type-0",
|
|
|
+ "in-type-1",
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "预处理",
|
|
|
+ "action": "修改/变/调整"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s11",
|
|
|
+ "kind": "step",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "幂等",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "可选截断:Top-k(只保留前k个分数最高的token)/ Top-p(累积概率达p截断)/ Min-p(低于阈值全过滤)/ Typical(保留信息量接近期望熵的典型token)"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "可选步骤:降低长尾噪声,提升可读性/一致性;先列候选短名单再做决定;Top-k和Top-p常组合使用"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "logits向量",
|
|
|
+ "name": "adjusted_logits",
|
|
|
+ "value": "<经Temperature缩放+偏置+惩罚+屏蔽处理后的logits,形状仍[vocab_size,];不期望token得分被调整或置为-∞>",
|
|
|
+ "anchor": "← s10.adjusted_logits",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "候选Token列表",
|
|
|
+ "name": "candidate_logits",
|
|
|
+ "value": "<经Top-k/Top-p截断后的候选token子集,其余位置logits置为-∞;形状仍[vocab_size,]但有效候选数远小于vocab_size>",
|
|
|
+ "anchor": "→ s12",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": "/架构/逻辑/信息编排逻辑/条目列举/逐条罗列/通用列举"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "Top-k/Top-p截断 {in-type:logits向量} 得精简 {out-type:候选Token列表}",
|
|
|
+ "focus": [
|
|
|
+ "in-type-0",
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "预处理",
|
|
|
+ "action": "提取/物理提取/裁切"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "id": "s12",
|
|
|
+ "kind": "step",
|
|
|
+ "via": "<llm>",
|
|
|
+ "feature": "随机",
|
|
|
+ "control": "-",
|
|
|
+ "instruction": [
|
|
|
+ [
|
|
|
+ "directive",
|
|
|
+ "Sampling:softmax(candidate_logits)得概率分布→随机按概率采样;Greedy:argmax(candidate_logits)直接选最高分;得到next_token_id后Detokenize为字符/子词"
|
|
|
+ ],
|
|
|
+ [
|
|
|
+ "memo",
|
|
|
+ "Sampling目的:多样性;Greedy目的:确定性可复现;两者选择取决于应用场景;Detokenize把整数ID还原为用户可见文本;第一个token产出后进入自回归循环"
|
|
|
+ ]
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "type": "候选Token列表",
|
|
|
+ "name": "candidate_logits",
|
|
|
+ "value": "<经Top-k/Top-p截断后的候选token子集,其余位置logits置为-∞;形状仍[vocab_size,]但有效候选数远小于vocab_size>",
|
|
|
+ "anchor": "← s11.candidate_logits",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/机器学习",
|
|
|
+ "form": "/架构/逻辑/信息编排逻辑/条目列举/逐条罗列/通用列举"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "type": "输出Token",
|
|
|
+ "name": "first_token",
|
|
|
+ "value": "<第一个生成Token,如next_token_id=1234对应'的';Detokenize后为字符或子词(如'##子'前缀);后续进入自回归循环续token生成>",
|
|
|
+ "anchor": "→ 返回 first_token",
|
|
|
+ "substance": "/理念/知识/商业/前沿技术/AI智能/AI应用",
|
|
|
+ "form": null
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "intent": "采样/贪心从 {in-type:候选Token列表} 选出并Detokenize为 {out-type:输出Token}",
|
|
|
+ "focus": [
|
|
|
+ "in-type-0",
|
|
|
+ "out-type-0"
|
|
|
+ ],
|
|
|
+ "effect": "交付",
|
|
|
+ "action": "生成/元素生成"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "return_row": {
|
|
|
+ "arrow": "↩",
|
|
|
+ "text": "<span class=\"kw\">返回</span> <span class=\"chip\" data-type=\"输出Token\">输出Token</span> 第一个生成Token(字符/子词,可继续自回归循环)"
|
|
|
+ }
|
|
|
+ }
|
|
|
+ ]
|
|
|
+}
|