diff --git a/integration/BitNet/vllm_workspace/inference_with_compress_format.py b/integration/BitNet/vllm_workspace/inference_with_compress_format.py index 9e60fa974..55a24543e 100644 --- a/integration/BitNet/vllm_workspace/inference_with_compress_format.py +++ b/integration/BitNet/vllm_workspace/inference_with_compress_format.py @@ -35,7 +35,9 @@ ckpt_path, dtype="half", quantization="bitblas", - enforce_eager=True, # set False to enable cuda graph + # set enforce_eager = False to enable cuda graph + # set enforce_eager = True to disable cuda graph + enforce_eager=False, ) as bitnet_model: bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], max_tokens=1024) diff --git a/integration/BitNet/vllm_workspace/inference_with_native_format.py b/integration/BitNet/vllm_workspace/inference_with_native_format.py index 579c5e17d..4f5f87f6f 100644 --- a/integration/BitNet/vllm_workspace/inference_with_native_format.py +++ b/integration/BitNet/vllm_workspace/inference_with_native_format.py @@ -18,7 +18,7 @@ # get the path of the current file current_file_path = os.path.realpath(__file__) current_dir = os.path.dirname(current_file_path) -ckpt_path = os.path.join(current_dir, "../models/ckpt_bitnet_b1_58-3B_bitblas") +ckpt_path = os.path.join(current_dir, "../models/ckpt_bitnet_b1_58-3B") parser = argparse.ArgumentParser(description="Inference with BitNet") parser.add_argument( @@ -35,8 +35,11 @@ with VllmRunner( ckpt_path, dtype="half", - quantization="bitnet", + quantization="bitnet_bitblas", gpu_memory_utilization=0.5, + # set enforce_eager = False to enable cuda graph + # set enforce_eager = True to disable cuda graph + enforce_eager=False, ) as bitnet_model: bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], max_tokens=128) print("bitnet inference output:")