Reduce VRAM usage by half during model loading

* This moves the call to half() before model.to(device) to avoid copying the
full-precision model to the GPU. This improves loading speed and dramatically reduces memory usage

* This fix contributed by @mh-dm (Mihai)
This commit is contained in:
Lincoln Stein 2022-09-10 10:02:43 -04:00
parent 99122708ca
commit 5c43988862

View File

@ -536,9 +536,6 @@ class Generate:
sd = pl_sd['state_dict']
model = instantiate_from_config(config.model)
m, u = model.load_state_dict(sd, strict=False)
model.to(self.device)
model.eval()
if self.full_precision:
print(
@ -549,6 +546,8 @@ class Generate:
'>> Using half precision math. Call with --full_precision to use more accurate but VRAM-intensive full precision.'
)
model.half()
model.to(self.device)
model.eval()
# usage statistics
toc = time.time()