Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion diffsynth_engine/conf/models/flux/flux_dit.json
Original file line number Diff line number Diff line change
Expand Up @@ -101,5 +101,24 @@
"proj_mlp": "proj_in_besides_attn",
"proj_out": "proj_out"
}
}
},
"preferred_kontext_resolutions": [
[672, 1568],
[688, 1504],
[720, 1456],
[752, 1392],
[800, 1328],
[832, 1248],
[880, 1184],
[944, 1104],
[1024, 1024],
[1104, 944],
[1184, 880],
[1248, 832],
[1328, 800],
[1392, 752],
[1456, 720],
[1504, 688],
[1568, 672]
]
}
258 changes: 253 additions & 5 deletions diffsynth_engine/conf/models/flux/flux_vae.json

Large diffs are not rendered by default.

20 changes: 9 additions & 11 deletions diffsynth_engine/models/flux/flux_controlnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,18 +119,16 @@ def patchify(self, hidden_states):

def forward(
self,
hidden_states,
control_condition,
control_scale,
timestep,
prompt_emb,
pooled_prompt_emb,
guidance,
image_ids,
text_ids,
hidden_states: torch.Tensor,
control_condition: torch.Tensor,
control_scale: float,
timestep: torch.Tensor,
prompt_emb: torch.Tensor,
pooled_prompt_emb: torch.Tensor,
image_ids: torch.Tensor,
text_ids: torch.Tensor,
guidance: torch.Tensor,
):
hidden_states = self.patchify(hidden_states)
control_condition = self.patchify(control_condition)
hidden_states = self.x_embedder(hidden_states) + self.controlnet_x_embedder(control_condition)
condition = (
self.time_embedder(timestep, hidden_states.dtype)
Expand Down
38 changes: 17 additions & 21 deletions diffsynth_engine/models/flux/flux_dit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import torch
import torch.nn as nn
import numpy as np
from typing import Any, Dict, Optional
from typing import Any, Dict, List, Optional
from einops import rearrange

from diffsynth_engine.models.basic.transformer_helper import (
Expand Down Expand Up @@ -245,7 +245,7 @@ def __init__(
self.ff_a = nn.Sequential(
nn.Linear(dim, dim * 4, device=device, dtype=dtype),
nn.GELU(approximate="tanh"),
nn.Linear(dim * 4, dim, device=device, dtype=dtype)
nn.Linear(dim * 4, dim, device=device, dtype=dtype),
)
# Text
self.norm_msa_b = AdaLayerNormZero(dim, device=device, dtype=dtype)
Expand Down Expand Up @@ -395,21 +395,19 @@ def prepare_image_ids(latents: torch.Tensor):

def forward(
self,
hidden_states,
timestep,
prompt_emb,
pooled_prompt_emb,
image_emb,
guidance,
text_ids,
image_ids=None,
controlnet_double_block_output=None,
controlnet_single_block_output=None,
hidden_states: torch.Tensor,
timestep: torch.Tensor,
prompt_emb: torch.Tensor,
pooled_prompt_emb: torch.Tensor,
image_ids: torch.Tensor,
text_ids: torch.Tensor,
guidance: torch.Tensor,
image_emb: torch.Tensor | None = None,
controlnet_double_block_output: List[torch.Tensor] | None = None,
controlnet_single_block_output: List[torch.Tensor] | None = None,
**kwargs,
):
h, w = hidden_states.shape[-2:]
if image_ids is None:
image_ids = self.prepare_image_ids(hidden_states)
image_seq_len = hidden_states.shape[1]
controlnet_double_block_output = (
controlnet_double_block_output if controlnet_double_block_output is not None else ()
)
Expand All @@ -428,10 +426,10 @@ def forward(
timestep,
prompt_emb,
pooled_prompt_emb,
image_emb,
guidance,
text_ids,
image_ids,
text_ids,
guidance,
image_emb,
*controlnet_double_block_output,
*controlnet_single_block_output,
),
Expand All @@ -448,7 +446,6 @@ def forward(
rope_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
text_rope_emb = rope_emb[:, :, : text_ids.size(1)]
image_rope_emb = rope_emb[:, :, text_ids.size(1) :]
hidden_states = self.patchify(hidden_states)

with sequence_parallel(
(
Expand Down Expand Up @@ -489,9 +486,8 @@ def forward(
hidden_states = hidden_states[:, prompt_emb.shape[1] :]
hidden_states = self.final_norm_out(hidden_states, conditioning)
hidden_states = self.final_proj_out(hidden_states)
(hidden_states,) = sequence_parallel_unshard((hidden_states,), seq_dims=(1,), seq_lens=(h * w // 4,))
(hidden_states,) = sequence_parallel_unshard((hidden_states,), seq_dims=(1,), seq_lens=(image_seq_len,))

hidden_states = self.unpatchify(hidden_states, h, w)
(hidden_states,) = cfg_parallel_unshard((hidden_states,), use_cfg=use_cfg)
return hidden_states

Expand Down
38 changes: 17 additions & 21 deletions diffsynth_engine/models/flux/flux_dit_fbcache.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import torch
import numpy as np
from typing import Any, Dict, Optional
from typing import Any, Dict, List, Optional

from diffsynth_engine.utils.gguf import gguf_inference
from diffsynth_engine.utils.fp8_linear import fp8_inference
Expand Down Expand Up @@ -48,21 +48,19 @@ def refresh_cache_status(self, num_inference_steps):

def forward(
self,
hidden_states,
timestep,
prompt_emb,
pooled_prompt_emb,
image_emb,
guidance,
text_ids,
image_ids=None,
controlnet_double_block_output=None,
controlnet_single_block_output=None,
hidden_states: torch.Tensor,
timestep: torch.Tensor,
prompt_emb: torch.Tensor,
pooled_prompt_emb: torch.Tensor,
image_ids: torch.Tensor,
text_ids: torch.Tensor,
guidance: torch.Tensor,
image_emb: torch.Tensor | None = None,
controlnet_double_block_output: List[torch.Tensor] | None = None,
controlnet_single_block_output: List[torch.Tensor] | None = None,
**kwargs,
):
h, w = hidden_states.shape[-2:]
if image_ids is None:
image_ids = self.prepare_image_ids(hidden_states)
image_seq_len = hidden_states.shape[1]
controlnet_double_block_output = (
controlnet_double_block_output if controlnet_double_block_output is not None else ()
)
Expand All @@ -81,10 +79,10 @@ def forward(
timestep,
prompt_emb,
pooled_prompt_emb,
image_emb,
guidance,
text_ids,
image_ids,
text_ids,
guidance,
image_emb,
*controlnet_double_block_output,
*controlnet_single_block_output,
),
Expand All @@ -101,7 +99,6 @@ def forward(
rope_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
text_rope_emb = rope_emb[:, :, : text_ids.size(1)]
image_rope_emb = rope_emb[:, :, text_ids.size(1) :]
hidden_states = self.patchify(hidden_states)

with sequence_parallel(
(
Expand Down Expand Up @@ -131,7 +128,7 @@ def forward(
first_hidden_states_residual = hidden_states - original_hidden_states

(first_hidden_states_residual,) = sequence_parallel_unshard(
(first_hidden_states_residual,), seq_dims=(1,), seq_lens=(h * w // 4,)
(first_hidden_states_residual,), seq_dims=(1,), seq_lens=(image_seq_len,)
)

if self.step_count == 0 or self.step_count == (self.num_inference_steps - 1):
Expand Down Expand Up @@ -172,9 +169,8 @@ def forward(

hidden_states = self.final_norm_out(hidden_states, conditioning)
hidden_states = self.final_proj_out(hidden_states)
(hidden_states,) = sequence_parallel_unshard((hidden_states,), seq_dims=(1,), seq_lens=(h * w // 4,))
(hidden_states,) = sequence_parallel_unshard((hidden_states,), seq_dims=(1,), seq_lens=(image_seq_len,))

hidden_states = self.unpatchify(hidden_states, h, w)
(hidden_states,) = cfg_parallel_unshard((hidden_states,), use_cfg=use_cfg)

return hidden_states
Expand Down
20 changes: 19 additions & 1 deletion diffsynth_engine/models/flux/flux_vae.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,29 @@ def _from_civitai(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.
new_state_dict[name_] = param
return new_state_dict

def _from_diffusers(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
    """Re-key a diffusers-format state dict using the configured rename table.

    Entries whose key is missing from ``config["diffusers"]["rename_dict"]``
    are dropped. Tensors whose renamed key contains ``"transformer_blocks"``
    are passed through ``squeeze()``; every other tensor is stored as-is.

    Args:
        state_dict: Mapping from diffusers parameter names to tensors.

    Returns:
        A new dict keyed by the renamed parameter names.
    """
    key_map = config["diffusers"]["rename_dict"]
    converted = {}
    for src_key, tensor in state_dict.items():
        if src_key in key_map:
            dst_key = key_map[src_key]
            # Only the "transformer_blocks" entries get their size-1 dims removed.
            converted[dst_key] = tensor.squeeze() if "transformer_blocks" in dst_key else tensor
    return converted

def convert(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
assert self.has_decoder or self.has_encoder, "Either decoder or encoder must be present"
if "decoder.conv_in.weight" in state_dict or "encoder.conv_in.weight" in state_dict:
if "decoder.up.0.block.0.conv1.weight" in state_dict or "encoder.down.0.block.0.conv1.weight" in state_dict:
state_dict = self._from_civitai(state_dict)
logger.info("use civitai format state dict")
elif (
"decoder.up_blocks.0.resnets.0.conv1.weight" in state_dict
or "encoder.down_blocks.0.resnets.0.conv1.weight" in state_dict
):
state_dict = self._from_diffusers(state_dict)
logger.info("use diffusers format state dict")
else:
logger.info("use diffsynth format state dict")
return self._filter(state_dict)
Expand Down
Loading