ソースを参照

Avoid cuda-dependent code for CPU-only inference (#499)

* Avoid cuda-dependent code for CPU-only inference

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Sergey Aleynikov 1年前
コミット
9e2f5e6b3a
1 ファイル変更、7 行追加、2 行削除
  1. +7 −2
      tools/llama/generate.py

+ 7 - 2
tools/llama/generate.py

@@ -2,6 +2,7 @@ import os
 import queue
 import threading
 import time
+from contextlib import nullcontext
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Literal, Optional, Tuple, Union
@@ -181,8 +182,12 @@ def decode_n_tokens(
         else:
             window = previous_tokens[:, i - win_size : i]
 
-        with torch.backends.cuda.sdp_kernel(
-            enable_flash=False, enable_mem_efficient=False, enable_math=True
+        with (
+            torch.backends.cuda.sdp_kernel(
+                enable_flash=False, enable_mem_efficient=False, enable_math=True
+            )
+            if torch.cuda.is_available()
+            else nullcontext()
         ):  # Actually better for Inductor to codegen attention here
             next_token = decode_one_token(
                 model=model,