Procházet zdrojové kódy

Avoid cuda-dependent code for CPU-only inference (#499)

* Avoid cuda-dependent code for CPU-only inference

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Sergey Aleynikov před 1 rokem
rodič
revize
9e2f5e6b3a
1 změnil soubory, kde provedl 7 přidání a 2 odebrání
  1. 7 2
      tools/llama/generate.py

+ 7 - 2
tools/llama/generate.py

@@ -2,6 +2,7 @@ import os
 import queue
 import threading
 import time
+from contextlib import nullcontext
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Literal, Optional, Tuple, Union
@@ -181,8 +182,12 @@ def decode_n_tokens(
         else:
             window = previous_tokens[:, i - win_size : i]
 
-        with torch.backends.cuda.sdp_kernel(
-            enable_flash=False, enable_mem_efficient=False, enable_math=True
+        with (
+            torch.backends.cuda.sdp_kernel(
+                enable_flash=False, enable_mem_efficient=False, enable_math=True
+            )
+            if torch.cuda.is_available()
+            else nullcontext()
         ):  # Actually better for Inductor to codegen attention here
             next_token = decode_one_token(
                 model=model,