First pass at dynamically calculating the working memory requirements for the VAE decoding operation. Still need to tune SD3 and FLUX.

2025-01-07 03:17:05 +08:00 · 2024-12-19 15:26:16 -05:00 · 2024-12-19 15:26:16 -05:00 · f01e41ceaf
commit f01e41ceaf
parent 609ed06265
3 changed files with 54 additions and 8 deletions
--- a/invokeai/app/invocations/flux_vae_decode.py
+++ b/invokeai/app/invocations/flux_vae_decode.py
@ -3,6 +3,7 @@ from einops import rearrange
 from PIL import Image

 from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
+from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR
 from invokeai.app.invocations.fields import (
    FieldDescriptions,
    Input,
@ -38,8 +39,22 @@ class FluxVaeDecodeInvocation(BaseInvocation, WithMetadata, WithBoard):
        input=Input.Connection,
    )

+    def _estimate_working_memory(self, latents: torch.Tensor, vae: AutoEncoder) -> int:
+        """Estimate the working memory required by the invocation in bytes."""
+        # It was found experimentally that the peak working memory scales linearly with the number of pixels and the
+        # element size (precision). This estimate is accurate for both SD1 and SDXL.
+        out_h = LATENT_SCALE_FACTOR * latents.shape[-2]
+        out_w = LATENT_SCALE_FACTOR * latents.shape[-1]
+        element_size = next(vae.parameters()).element_size()
+        # TODO(ryand): Need to tune this value, it was copied from the SD1 implementation.
+        scaling_constant = 960  # Determined experimentally.
+        working_memory = out_h * out_w * element_size * scaling_constant
+
+        return working_memory
+
    def _vae_decode(self, vae_info: LoadedModel, latents: torch.Tensor) -> Image.Image:
-        with vae_info as vae:
+        estimated_working_memory = self._estimate_working_memory(latents, vae_info.model)
+        with vae_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, vae):
            assert isinstance(vae, AutoEncoder)
            vae_dtype = next(iter(vae.parameters())).dtype
            latents = latents.to(device=TorchDevice.choose_torch_device(), dtype=vae_dtype)
--- a/invokeai/app/invocations/latents_to_image.py
+++ b/invokeai/app/invocations/latents_to_image.py
@ -34,7 +34,7 @@ from invokeai.backend.util.devices import TorchDevice
    title="Latents to Image",
    tags=["latents", "image", "vae", "l2i"],
    category="latents",
-    version="1.3.0",
+    version="1.3.1",
 )
 class LatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
    """Generates an image from latents."""
@ -53,18 +53,31 @@ class LatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
    tile_size: int = InputField(default=0, multiple_of=8, description=FieldDescriptions.vae_tile_size)
    fp32: bool = InputField(default=DEFAULT_PRECISION == torch.float32, description=FieldDescriptions.fp32)

+    def _estimate_working_memory(self, latents: torch.Tensor) -> int:
+        """Estimate the working memory required by the invocation in bytes."""
+        # It was found experimentally that the peak working memory scales linearly with the number of pixels and the
+        # element size (precision). This estimate is accurate for both SD1 and SDXL.
+        out_h = LATENT_SCALE_FACTOR * latents.shape[-2]
+        out_w = LATENT_SCALE_FACTOR * latents.shape[-1]
+        element_size = 4 if self.fp32 else 2
+        scaling_constant = 960  # Determined experimentally.
+        working_memory = out_h * out_w * element_size * scaling_constant
+
+        if self.fp32:
+            # If we are running in FP32, then we should account for the likely increase in model size (~250MB).
+            working_memory += 250 * 2**20
+
+        return working_memory
+
    @torch.no_grad()
    def invoke(self, context: InvocationContext) -> ImageOutput:
        latents = context.tensors.load(self.latents.latents_name)

        vae_info = context.models.load(self.vae.vae)
        assert isinstance(vae_info.model, (AutoencoderKL, AutoencoderTiny))
-        # Reserve 6GB of VRAM for the VAE.
-        # Experimentally, this was found to be sufficient for decoding a 1024x1024 image.
-        # TODO(ryand): Set the requested working memory dynamically based on the image size (and self.fp32).
        with (
            SeamlessExt.static_patch_model(vae_info.model, self.vae.seamless_axes),
-            vae_info.model_on_device(working_mem_bytes=6 * 2**30) as (_, vae),
+            vae_info.model_on_device(working_mem_bytes=self._estimate_working_memory(latents)) as (_, vae),
        ):
            context.util.signal_progress("Running VAE decoder")
            assert isinstance(vae, (AutoencoderKL, AutoencoderTiny))
--- a/invokeai/app/invocations/sd3_latents_to_image.py
+++ b/invokeai/app/invocations/sd3_latents_to_image.py
@ -6,6 +6,7 @@ from einops import rearrange
 from PIL import Image

 from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
+from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR
 from invokeai.app.invocations.fields import (
    FieldDescriptions,
    Input,
@ -26,7 +27,7 @@ from invokeai.backend.util.devices import TorchDevice
    title="SD3 Latents to Image",
    tags=["latents", "image", "vae", "l2i", "sd3"],
    category="latents",
-    version="1.3.0",
+    version="1.3.1",
 )
 class SD3LatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
    """Generates an image from latents."""
@ -40,13 +41,30 @@ class SD3LatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
        input=Input.Connection,
    )

+    def _estimate_working_memory(self, latents: torch.Tensor, vae: AutoencoderKL) -> int:
+        """Estimate the working memory required by the invocation in bytes."""
+        # It was found experimentally that the peak working memory scales linearly with the number of pixels and the
+        # element size (precision). This estimate is accurate for both SD1 and SDXL.
+        out_h = LATENT_SCALE_FACTOR * latents.shape[-2]
+        out_w = LATENT_SCALE_FACTOR * latents.shape[-1]
+        element_size = next(vae.parameters()).element_size()
+        # TODO(ryand): Need to tune this value, it was copied from the SD1 implementation.
+        scaling_constant = 960  # Determined experimentally.
+        working_memory = out_h * out_w * element_size * scaling_constant
+
+        return working_memory
+
    @torch.no_grad()
    def invoke(self, context: InvocationContext) -> ImageOutput:
        latents = context.tensors.load(self.latents.latents_name)

        vae_info = context.models.load(self.vae.vae)
        assert isinstance(vae_info.model, (AutoencoderKL))
-        with SeamlessExt.static_patch_model(vae_info.model, self.vae.seamless_axes), vae_info as vae:
+        estimated_working_memory = self._estimate_working_memory(latents, vae_info.model)
+        with (
+            SeamlessExt.static_patch_model(vae_info.model, self.vae.seamless_axes),
+            vae_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, vae),
+        ):
            context.util.signal_progress("Running VAE")
            assert isinstance(vae, (AutoencoderKL))
            latents = latents.to(vae.device)