Mirror of https://github.com/invoke-ai/InvokeAI.git, synced 2025-01-08 11:57:36 +08:00
Reduce peak memory used for unit tests.
Commit a83a999b79 (parent f8a6accf8a)
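
The change itself is uniform: every test input shrinks from a batch of 10 rows to a single row, so every tensor the tests allocate downstream shrinks by the same factor. A rough sketch of the arithmetic (not part of the commit; the Linear(32, 64) layer and float32 dtype are assumptions picked only for illustration):

import torch

# Illustration only: compare the activation footprint of a batch of 10 vs a batch of 1
# flowing through a small hypothetical linear layer (32 -> 64 features, float32).
layer = torch.nn.Linear(32, 64)
for batch in (10, 1):
    x = torch.randn(batch, 32)
    y = layer(x)
    total_bytes = (x.numel() + y.numel()) * x.element_size()
    print(f"batch={batch}: input + output activations occupy {total_bytes} bytes")
# batch=10 -> 3840 bytes, batch=1 -> 384 bytes: a 10x reduction in activation memory.
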
@@ -40,7 +40,7 @@ def linear_8bit_lt_layer():
 def test_custom_invoke_linear_8bit_lt_all_weights_on_cuda(linear_8bit_lt_layer: InvokeLinear8bitLt):
     """Test CustomInvokeLinear8bitLt inference with all weights on the GPU."""
     # Run inference on the original layer.
-    x = torch.randn(10, 32).to("cuda")
+    x = torch.randn(1, 32).to("cuda")
     y_quantized = linear_8bit_lt_layer(x)

     # Wrap the InvokeLinear8bitLt layer in a CustomInvokeLinear8bitLt layer, and run inference on it.
@@ -54,7 +54,7 @@ def test_custom_invoke_linear_8bit_lt_all_weights_on_cuda(linear_8bit_lt_layer:
 def test_custom_invoke_linear_8bit_lt_all_weights_on_cpu(linear_8bit_lt_layer: InvokeLinear8bitLt):
     """Test CustomInvokeLinear8bitLt inference with all weights on the CPU (streaming to the GPU)."""
     # Run inference on the original layer.
-    x = torch.randn(10, 32).to("cuda")
+    x = torch.randn(1, 32).to("cuda")
     y_quantized = linear_8bit_lt_layer(x)

     # Copy the state dict to the CPU and reload it.
@@ -98,7 +98,7 @@ def linear_nf4_layer():
 def test_custom_invoke_linear_nf4_all_weights_on_cuda(linear_nf4_layer: InvokeLinearNF4):
     """Test CustomInvokeLinearNF4 inference with all weights on the GPU."""
     # Run inference on the original layer.
-    x = torch.randn(10, 32).to("cuda")
+    x = torch.randn(1, 32).to("cuda")
     y_quantized = linear_nf4_layer(x)

     # Wrap the InvokeLinearNF4 layer in a CustomInvokeLinearNF4 layer, and run inference on it.
@@ -112,7 +112,7 @@ def test_custom_invoke_linear_nf4_all_weights_on_cuda(linear_nf4_layer: InvokeLi
 def test_custom_invoke_linear_nf4_all_weights_on_cpu(linear_nf4_layer: InvokeLinearNF4):
     """Test CustomInvokeLinearNF4 inference with all weights on the CPU (streaming to the GPU)."""
     # Run inference on the original layer.
-    x = torch.randn(10, 32).to(device="cuda")
+    x = torch.randn(1, 32).to(device="cuda")
     y_quantized = linear_nf4_layer(x)

     # Copy the state dict to the CPU and reload it.
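
The four hunks above all exercise the same pattern: run a small input through the original quantized layer, then through its custom wrapper, and compare the outputs. A generic sketch of that comparison pattern, using plain torch.nn.Linear stand-ins rather than the InvokeLinear8bitLt / InvokeLinearNF4 classes (so this is not the InvokeAI code, just the shape of the check):

import torch

# Hypothetical stand-ins: the real tests use the InvokeAI quantized layers and their
# Custom* wrappers; plain Linear layers are used here only to show the pattern.
torch.manual_seed(0)
original = torch.nn.Linear(32, 64)
wrapped = torch.nn.Linear(32, 64)
wrapped.load_state_dict(original.state_dict())

x = torch.randn(1, 32)  # batch of 1, matching the reduced test inputs
torch.testing.assert_close(wrapped(x), original(x))
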
@@ -57,7 +57,7 @@ def test_torch_module_autocast_linear_layer(device: torch.device, model: torch.n
     assert all(p.device.type == "cpu" for p in model.parameters())

     # Run inference on the CPU.
-    x = torch.randn(10, 32, device="cpu")
+    x = torch.randn(1, 32, device="cpu")
     expected = model(x)
     assert expected.device.type == "cpu"

@@ -103,7 +103,7 @@ def test_torch_module_autocast_bnb_llm_int8_linear_layer():
     assert model.linear.weight.SCB is not None

     # Run inference on the GPU.
-    x = torch.randn(10, 32)
+    x = torch.randn(1, 32)
     expected = model(x.to("cuda"))
     assert expected.device.type == "cuda"

@@ -33,7 +33,7 @@ def test_invoke_linear_8bit_lt_quantization():
     assert quantized_layer.weight.CB.dtype == torch.int8

     # Run inference on both the original and quantized layers.
-    x = torch.randn(10, 32)
+    x = torch.randn(1, 32)
     y = orig_layer(x)
     y_quantized = quantized_layer(x.to("cuda"))
     assert y.shape == y_quantized.shape
@@ -53,7 +53,7 @@ def test_invoke_linear_8bit_lt_state_dict_roundtrip():
     orig_layer_state_dict = orig_layer.state_dict()

     # Run inference on the original layer.
-    x = torch.randn(10, 32)
+    x = torch.randn(1, 32)
     y = orig_layer(x)

     # Prepare a quantized InvokeLinear8bitLt layer.
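
None of the hunks add a memory measurement; the saving comes entirely from smaller input and activation tensors. To confirm the effect locally, PyTorch's CUDA allocator statistics are enough. A minimal sketch, assuming a CUDA device and a hypothetical Linear(32, 64) layer:

import torch

def peak_cuda_bytes(fn) -> int:
    """Reset the CUDA allocator's peak counter, run fn, and return the new peak."""
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    fn()
    torch.cuda.synchronize()
    return torch.cuda.max_memory_allocated()

if torch.cuda.is_available():
    layer = torch.nn.Linear(32, 64).to("cuda")
    for batch in (10, 1):
        peak = peak_cuda_bytes(lambda: layer(torch.randn(batch, 32, device="cuda")))
        print(f"batch={batch}: peak CUDA allocation = {peak} bytes")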