
[Bug]: Error when starting training #275

Open
supermachine77 opened this issue Apr 30, 2024 · 21 comments
Labels
bug Something isn't working

@supermachine77
supermachine77 commented Apr 30, 2024

I'm getting an error each time I try to run the training process. For context, I'm using Windows 11 with an RTX 3090, and both A1111 and ComfyUI work fine on my machine.

OneTrainer installs fine without any errors, but I get an error when I try to start the actual training process inside the GUI. I've tried reinstalling it, but that doesn't fix the issue. I've also tried upgrading transformers from 4.36.2 to 4.40.1 (per the attached pip freeze log), but that doesn't fix it either, so I doubt the issue is related to that. I'd really appreciate some guidance, as I've searched past bug reports and Google but can't find anything concrete that helps.

Config

I'm using the following config:

{
    "__version": 3,
    "training_method": "FINE_TUNE",
    "model_type": "STABLE_DIFFUSION_XL_10_BASE",
    "debug_mode": false,
    "debug_dir": "debug",
    "workspace_dir": "Z:/AI_Image_Generation/OneTrainer/OneTrainerWorkspace",
    "cache_dir": "Z:/AI_Image_Generation/OneTrainer/OneTrainerWorkspace/cache",
    "tensorboard": false,
    "tensorboard_expose": false,
    "continue_last_backup": false,
    "include_train_config": "NONE",
    "base_model_name": "Z:/AI_Image_Generation/Stability Matrix_V2/Data/Models/StableDiffusion/RealVisXL_V4.0.safetensors",
    "weight_dtype": "BFLOAT_16",
    "output_dtype": "BFLOAT_16",
    "output_model_format": "SAFETENSORS",
    "output_model_destination": "Z:/AI_Image_Generation/OneTrainer/OneTrainerWorkspace/tier1_fast_GH.safetensors",
    "gradient_checkpointing": true,
    "force_circular_padding": false,
    "concept_file_name": "training_concepts/concepts.json",
    "concepts": null,
    "circular_mask_generation": false,
    "random_rotate_and_crop": false,
    "aspect_ratio_bucketing": false,
    "latent_caching": true,
    "clear_cache_before_training": false,
    "learning_rate_scheduler": "CONSTANT",
    "learning_rate": 1e-05,
    "learning_rate_warmup_steps": 200,
    "learning_rate_cycles": 1,
    "epochs": 250,
    "batch_size": 1,
    "gradient_accumulation_steps": 1,
    "ema": "OFF",
    "ema_decay": 0.999,
    "ema_update_step_interval": 1,
    "dataloader_threads": 2,
    "train_device": "cuda",
    "temp_device": "cpu",
    "train_dtype": "BFLOAT_16",
    "fallback_train_dtype": "FLOAT_32",
    "enable_autocast_cache": true,
    "only_cache": false,
    "resolution": "1024",
    "attention_mechanism": "DEFAULT",
    "align_prop": false,
    "align_prop_probability": 0.1,
    "align_prop_loss": "AESTHETIC",
    "align_prop_weight": 0.01,
    "align_prop_steps": 20,
    "align_prop_truncate_steps": 0.5,
    "align_prop_cfg_scale": 7.0,
    "mse_strength": 1.0,
    "mae_strength": 0.0,
    "vb_loss_strength": 1.0,
    "loss_weight_fn": "CONSTANT",
    "loss_weight_strength": 5.0,
    "dropout_probability": 0.0,
    "loss_scaler": "NONE",
    "learning_rate_scaler": "NONE",
    "offset_noise_weight": 0.0,
    "perturbation_noise_weight": 0.0,
    "rescale_noise_scheduler_to_zero_terminal_snr": false,
    "force_v_prediction": false,
    "force_epsilon_prediction": false,
    "min_noising_strength": 0.0,
    "max_noising_strength": 1.0,
    "noising_weight": 0.0,
    "noising_bias": 0.5,
    "unet": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": 10000,
        "stop_training_after_unit": "EPOCH",
        "learning_rate": 1e-05,
        "weight_dtype": "BFLOAT_16"
    },
    "prior": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": 10000,
        "stop_training_after_unit": "EPOCH",
        "learning_rate": null,
        "weight_dtype": "NONE"
    },
    "text_encoder": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": 250,
        "stop_training_after_unit": "NEVER",
        "learning_rate": 3e-06,
        "weight_dtype": "BFLOAT_16"
    },
    "text_encoder_layer_skip": 0,
    "text_encoder_2": {
        "__version": 0,
        "model_name": "",
        "train": false,
        "stop_training_after": 30,
        "stop_training_after_unit": "EPOCH",
        "learning_rate": null,
        "weight_dtype": "BFLOAT_16"
    },
    "text_encoder_2_layer_skip": 0,
    "vae": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": null,
        "stop_training_after_unit": "NEVER",
        "learning_rate": null,
        "weight_dtype": "FLOAT_32"
    },
    "effnet_encoder": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": null,
        "stop_training_after_unit": "NEVER",
        "learning_rate": null,
        "weight_dtype": "NONE"
    },
    "decoder": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": null,
        "stop_training_after_unit": "NEVER",
        "learning_rate": null,
        "weight_dtype": "NONE"
    },
    "decoder_text_encoder": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": null,
        "stop_training_after_unit": "NEVER",
        "learning_rate": null,
        "weight_dtype": "NONE"
    },
    "decoder_vqgan": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": null,
        "stop_training_after_unit": "NEVER",
        "learning_rate": null,
        "weight_dtype": "NONE"
    },
    "masked_training": false,
    "unmasked_probability": 0.1,
    "unmasked_weight": 0.1,
    "normalize_masked_area_loss": false,
    "embedding_learning_rate": null,
    "preserve_embedding_norm": false,
    "embedding": {
        "__version": 0,
        "uuid": "f561c996-8a3f-4ff4-9b44-c819f14594f1",
        "model_name": "",
        "placeholder": "<embedding>",
        "train": true,
        "stop_training_after": null,
        "stop_training_after_unit": "NEVER",
        "token_count": 1,
        "initial_embedding_text": "*"
    },
    "additional_embeddings": [],
    "embedding_weight_dtype": "FLOAT_32",
    "lora_model_name": "",
    "lora_rank": 16,
    "lora_alpha": 1.0,
    "lora_weight_dtype": "FLOAT_32",
    "optimizer": {
        "__version": 0,
        "optimizer": "ADAFACTOR",
        "adam_w_mode": false,
        "alpha": null,
        "amsgrad": false,
        "beta1": null,
        "beta2": null,
        "beta3": null,
        "bias_correction": false,
        "block_wise": false,
        "capturable": false,
        "centered": false,
        "clip_threshold": 1.0,
        "d0": null,
        "d_coef": null,
        "dampening": null,
        "decay_rate": -0.8,
        "decouple": false,
        "differentiable": false,
        "eps": 1e-30,
        "eps2": 0.001,
        "foreach": false,
        "fsdp_in_use": false,
        "fused": false,
        "fused_back_pass": false,
        "growth_rate": null,
        "initial_accumulator_value": null,
        "is_paged": false,
        "log_every": null,
        "lr_decay": null,
        "max_unorm": null,
        "maximize": false,
        "min_8bit_size": null,
        "momentum": null,
        "nesterov": false,
        "no_prox": false,
        "optim_bits": null,
        "percentile_clipping": null,
        "relative_step": false,
        "safeguard_warmup": false,
        "scale_parameter": false,
        "stochastic_rounding": false,
        "use_bias_correction": false,
        "use_triton": false,
        "warmup_init": false,
        "weight_decay": 0.01
    },
    "optimizer_defaults": {
        "ADAFACTOR": {
            "__version": 0,
            "optimizer": "ADAFACTOR",
            "adam_w_mode": false,
            "alpha": null,
            "amsgrad": false,
            "beta1": null,
            "beta2": null,
            "beta3": null,
            "bias_correction": false,
            "block_wise": false,
            "capturable": false,
            "centered": false,
            "clip_threshold": 1.0,
            "d0": null,
            "d_coef": null,
            "dampening": null,
            "decay_rate": -0.8,
            "decouple": false,
            "differentiable": false,
            "eps": 1e-30,
            "eps2": 0.001,
            "foreach": false,
            "fsdp_in_use": false,
            "fused": false,
            "fused_back_pass": false,
            "growth_rate": null,
            "initial_accumulator_value": null,
            "is_paged": false,
            "log_every": null,
            "lr_decay": null,
            "max_unorm": null,
            "maximize": false,
            "min_8bit_size": null,
            "momentum": null,
            "nesterov": false,
            "no_prox": false,
            "optim_bits": null,
            "percentile_clipping": null,
            "relative_step": false,
            "safeguard_warmup": false,
            "scale_parameter": false,
            "stochastic_rounding": false,
            "use_bias_correction": false,
            "use_triton": false,
            "warmup_init": false,
            "weight_decay": 0.01
        }
    },
    "sample_definition_file_name": "training_samples/samples.json",
    "samples": null,
    "sample_after": 10,
    "sample_after_unit": "MINUTE",
    "sample_image_format": "JPG",
    "samples_to_tensorboard": false,
    "non_ema_sampling": false,
    "backup_after": 30,
    "backup_after_unit": "NEVER",
    "rolling_backup": false,
    "rolling_backup_count": 3,
    "backup_before_save": false,
    "save_after": 25,
    "save_after_unit": "EPOCH",
    "save_filename_prefix": "gh_tier1_fast"
}

Error log output

activating venv Z:\AI_Image_Generation\OneTrainer\venv
Using Python "Z:\AI_Image_Generation\OneTrainer\venv\Scripts\python.exe"
Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\utils\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
  _torch_pytree._register_pytree_node(
Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\utils\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
  _torch_pytree._register_pytree_node(
Traceback (most recent call last):
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 189, in load
    self.__load_internal(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 26, in __load_internal
    self.__load_diffusers(model, model_type, weight_dtypes, base_model_name, vae_model_name)
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 36, in __load_diffusers
    tokenizer_1 = CLIPTokenizer.from_pretrained(
  File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1925, in from_pretrained
    raise ValueError(
ValueError: Calling CLIPTokenizer.from_pretrained() with the path to a single file or url is not supported for this tokenizer. Use a model identifier or the path to a directory instead.

Traceback (most recent call last):
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 195, in load
    self.__load_diffusers(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 36, in __load_diffusers
    tokenizer_1 = CLIPTokenizer.from_pretrained(
  File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1925, in from_pretrained
    raise ValueError(
ValueError: Calling CLIPTokenizer.from_pretrained() with the path to a single file or url is not supported for this tokenizer. Use a model identifier or the path to a directory instead.

Traceback (most recent call last):
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file_utils.py", line 1479, in create_text_encoders_and_tokenizers_from_ldm
    tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
  File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1970, in from_pretrained
    tokenizer_config = json.load(reader)
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 293, in load
    return loads(fp.read(),
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 201, in load
    self.__load_safetensors(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 145, in __load_safetensors
    pipeline = StableDiffusionXLPipeline.from_single_file(
  File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\huggingface_hub\utils\_validators.py", line 118, in _inner_fn
    return fn(*args, **kwargs)
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file.py", line 289, in from_single_file
    components = build_sub_model_components(
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file.py", line 102, in build_sub_model_components
    text_encoder_components = create_text_encoders_and_tokenizers_from_ldm(
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file_utils.py", line 1485, in create_text_encoders_and_tokenizers_from_ldm
    raise ValueError(
ValueError: With local_files_only set to False, you must first locally save the text_encoder and tokenizer in the following path: 'openai/clip-vit-large-patch14'.

Traceback (most recent call last):
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file_utils.py", line 1479, in create_text_encoders_and_tokenizers_from_ldm
    tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
  File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1970, in from_pretrained
    tokenizer_config = json.load(reader)
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 293, in load
    return loads(fp.read(),
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 207, in load
    self.__load_ckpt(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 104, in __load_ckpt
    pipeline = StableDiffusionXLPipeline.from_single_file(
  File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\huggingface_hub\utils\_validators.py", line 118, in _inner_fn
    return fn(*args, **kwargs)
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file.py", line 289, in from_single_file
    components = build_sub_model_components(
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file.py", line 102, in build_sub_model_components
    text_encoder_components = create_text_encoders_and_tokenizers_from_ldm(
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file_utils.py", line 1485, in create_text_encoders_and_tokenizers_from_ldm
    raise ValueError(
ValueError: With local_files_only set to False, you must first locally save the text_encoder and tokenizer in the following path: 'openai/clip-vit-large-patch14'.

Traceback (most recent call last):
  File "Z:\AI_Image_Generation\OneTrainer\modules\ui\TrainUI.py", line 522, in __training_thread_function
    trainer.start()
  File "Z:\AI_Image_Generation\OneTrainer\modules\trainer\GenericTrainer.py", line 113, in start
    self.model = self.model_loader.load(
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\StableDiffusionXLFineTuneModelLoader.py", line 62, in load
    base_model_loader.load(model, model_type, model_names, weight_dtypes)
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 214, in load
    raise Exception("could not load model: " + model_names.base_model)
Exception: could not load model: Z:/AI_Image_Generation/OneTrainer/RealVisXL_V4.0.safetensors

Output of pip freeze

absl-py==2.1.0
accelerate==0.27.2
aiofiles==23.2.1
aiohttp==3.9.5
aiosignal==1.3.1
altair==5.3.0
annotated-types==0.6.0
annoy-fixed==1.16.3
antlr4-python3-runtime==4.9.3
anyio==4.3.0
appdirs==1.4.4
astunparse==1.6.3
async-timeout==4.0.3
attrs==23.2.0
beautifulsoup4==4.12.2
bidict==0.23.1
bitsandbytes==0.43.0
blinker==1.7.0
braceexpand==0.1.7
cachetools==5.3.3
certifi==2024.2.2
charset-normalizer==3.3.2
clean-fid==0.1.35
click==8.1.7
clip-anytorch==2.6.0
colorama==0.4.6
coloredlogs==15.0.1
contourpy==1.2.1
cycler==0.12.1
dadaptation==3.1
dctorch==0.1.2
decorator==4.4.2
diffusers==0.27.2
distlib==0.3.8
distro==1.9.0
docker-pycreds==0.4.0
easydict==1.10
easygui==0.98.3
einops==0.7.0
einops-exts==0.0.4
entrypoints==0.4
exceptiongroup==1.2.1
face-alignment==1.4.1
facexlib==0.3.0
fairscale==0.4.13
faiss-cpu==1.7.4
fastapi==0.110.2
ffmpeg-progress-yield==0.7.8
ffmpy==0.3.2
filelock==3.13.4
filetype==1.2.0
filterpy==1.4.5
Flask==2.3.2
Flask-SocketIO==5.3.4
flatbuffers==24.3.25
fonttools==4.51.0
frozenlist==1.4.1
fsspec==2024.3.1
ftfy==6.2.0
gast==0.5.4
gitdb==4.0.11
GitPython==3.1.43
google-pasta==0.2.0
gradio==4.19.0
gradio_client==0.10.0
gradio_imageslider==0.0.20
grpcio==1.62.2
h11==0.14.0
h5py==3.11.0
httpcore==1.0.5
httpx==0.27.0
huggingface-hub==0.22.2
humanfriendly==10.0
idna==3.7
imageio==2.34.1
imageio-ffmpeg==0.4.9
imagesize==1.4.1
importlib_metadata==7.1.0
importlib_resources==6.4.0
invisible-watermark==0.2.0
itsdangerous==2.1.2
Jinja2==3.1.3
joblib==1.4.0
jsonmerge==1.9.2
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
k-diffusion==0.1.1.post1
keras==3.2.1
kiwisolver==1.4.5
kornia==0.7.1
lazy_loader==0.4
libclang==18.1.1
lightning-utilities==0.11.2
lion-pytorch==0.0.6
llvmlite==0.42.0
lycoris_lora==2.2.0.post3
Markdown==3.5.2
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.8.3
mdurl==0.1.2
ml-dtypes==0.3.2
moviepy==1.0.3
mpmath==1.3.0
multidict==6.0.5
namex==0.0.8
networkx==3.3
ninja==1.11.1.1
numba==0.59.1
numpy==1.26.4
nvidia-ml-py==12.535.161
nvitop==1.3.2
omegaconf==2.3.0
onnx==1.15.0
onnxruntime-gpu==1.17.1
open-clip-torch==2.24.0
openai==1.3.3
openai-clip==1.0.1
opencv-python==4.9.0.80
opt-einsum==3.3.0
optree==0.11.0
orjson==3.10.1
packaging==24.0
pandas==2.2.1
pathtools==0.1.2
pillow==10.2.0
platformdirs==3.11.0
prodigyopt==1.0
proglog==0.1.10
protobuf==4.25.3
psutil==5.9.8
pydantic==2.7.0
pydantic_core==2.18.1
pydub==0.25.1
Pygments==2.17.2
pyparsing==3.1.2
pypdfium2==4.27.0
pyreadline3==3.4.1
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-engineio==4.9.0
python-multipart==0.0.9
python-socketio==5.11.2
pytorch-lightning==2.2.2
pytz==2024.1
PyWavelets==1.6.0
PyYAML==6.0.1
referencing==0.34.0
regex==2024.4.16
requests==2.31.0
rich==13.7.1
rpds-py==0.18.0
ruff==0.4.1
safetensors==0.4.3
scikit-image==0.23.2
scikit-learn==1.4.1.post1
scipy==1.12.0
semantic-version==2.10.0
semantra==0.1.8
sentencepiece==0.2.0
sentry-sdk==1.45.0
setproctitle==1.3.3
shellingham==1.5.4
simple-websocket==1.0.0
six==1.16.0
smmap==5.0.1
sniffio==1.3.1
soupsieve==2.5
starlette==0.37.2
sympy==1.12
tenacity==8.2.2
tensorboard==2.16.2
tensorboard-data-server==0.7.2
tensorflow==2.16.1
tensorflow-intel==2.16.1
tensorflow-io-gcs-filesystem==0.31.0
termcolor==2.4.0
threadpoolctl==3.4.0
tifffile==2024.4.18
tiktoken==0.4.0
timm==0.9.16
tk==0.1.0
tokenizers==0.19.1
toml==0.10.2
tomlkit==0.12.0
toolz==0.12.1
torch==2.2.0+cu121
torchaudio==2.2.0+cu121
torchdiffeq==0.2.3
torchmetrics==1.3.2
torchsde==0.2.6
torchvision==0.17.0+cu121
tqdm==4.66.2
trampoline==0.1.2
transformers==4.40.1
triton @ https://huggingface.co/MonsterMMORPG/SECourses/resolve/main/triton-2.1.0-cp310-cp310-win_amd64.whl
typer==0.12.3
typing_extensions==4.11.0
tzdata==2024.1
urllib3==2.2.1
uvicorn==0.28.0
virtualenv==20.23.0
voluptuous==0.13.1
wandb==0.16.4
wcwidth==0.2.13
webdataset==0.2.86
websockets==11.0.3
Werkzeug==2.3.6
Wikipedia-API==0.6.0
windows-curses==2.3.2
wrapt==1.16.0
wsproto==1.2.0
xformers==0.0.24
yarl==1.9.4
zipp==3.18.1

supermachine77 added the bug (Something isn't working) label Apr 30, 2024
@mx
Collaborator

mx commented Apr 30, 2024

Your diffusers version looks wrong. The output of "pip freeze" in your venv should be something like:

-e git+https://github.com/huggingface/diffusers.git@5d848ec#egg=diffusers

How did you actually install OneTrainer?

@supermachine77
Author

supermachine77 commented Apr 30, 2024

Wow that was a fast reply, thanks for that!

I installed it by cloning the repository (git clone https://github.com/Nerogar/OneTrainer.git) and then running the install.bat file. I've also tried installing it with a 1-click installer (StabilityMatrix), and the same issue arises.

The requirements-global.txt file does contain the same line you've shared ("-e git+https://github.com/huggingface/diffusers.git@5d848ec#egg=diffusers"), but I'm not quite sure how to fix this, as each time I install it ends up as "diffusers==0.27.2" in pip freeze. Any thoughts?

EDIT:

Nevermind, I managed to install diffusers the proper way. (In case anyone has the same issue: I activated the venv in the OneTrainer folder, ran "pip uninstall diffusers", then ran "pip install -e git+https://github.com/huggingface/diffusers.git@5d848ec#egg=diffusers".)
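In full, from the OneTrainer folder with the venv activated (the commit hash is the one pinned in requirements-global.txt):

pip uninstall diffusers
pip install -e git+https://github.com/huggingface/diffusers.git@5d848ec#egg=diffusers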

Although this fixes the pip freeze output (which now contains "-e git+https://github.com/huggingface/diffusers.git@5d848ec#egg=diffusers" instead of "diffusers==0.27.2"), I still get a similar error when I try to run the training process:

Error log

activating venv Z:\AI_Image_Generation\OneTrainer\venv
Using Python "Z:\AI_Image_Generation\OneTrainer\venv\Scripts\python.exe"
Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\utils\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
  _torch_pytree._register_pytree_node(
Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\utils\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
  _torch_pytree._register_pytree_node(
Traceback (most recent call last):
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 189, in load
    self.__load_internal(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 26, in __load_internal
    self.__load_diffusers(model, model_type, weight_dtypes, base_model_name, vae_model_name)
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 36, in __load_diffusers
    tokenizer_1 = CLIPTokenizer.from_pretrained(
  File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1925, in from_pretrained
    raise ValueError(
ValueError: Calling CLIPTokenizer.from_pretrained() with the path to a single file or url is not supported for this tokenizer. Use a model identifier or the path to a directory instead.

Traceback (most recent call last):
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 195, in load
    self.__load_diffusers(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 36, in __load_diffusers
    tokenizer_1 = CLIPTokenizer.from_pretrained(
  File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1925, in from_pretrained
    raise ValueError(
ValueError: Calling CLIPTokenizer.from_pretrained() with the path to a single file or url is not supported for this tokenizer. Use a model identifier or the path to a directory instead.

Traceback (most recent call last):
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file_utils.py", line 1479, in create_text_encoders_and_tokenizers_from_ldm
    tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
  File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1970, in from_pretrained
    tokenizer_config = json.load(reader)
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 293, in load
    return loads(fp.read(),
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 201, in load
    self.__load_safetensors(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 145, in __load_safetensors
    pipeline = StableDiffusionXLPipeline.from_single_file(
  File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\huggingface_hub\utils\_validators.py", line 118, in _inner_fn
    return fn(*args, **kwargs)
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file.py", line 289, in from_single_file
    components = build_sub_model_components(
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file.py", line 102, in build_sub_model_components
    text_encoder_components = create_text_encoders_and_tokenizers_from_ldm(
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file_utils.py", line 1485, in create_text_encoders_and_tokenizers_from_ldm
    raise ValueError(
ValueError: With local_files_only set to False, you must first locally save the text_encoder and tokenizer in the following path: 'openai/clip-vit-large-patch14'.

Traceback (most recent call last):
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file_utils.py", line 1479, in create_text_encoders_and_tokenizers_from_ldm
    tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
  File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1970, in from_pretrained
    tokenizer_config = json.load(reader)
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 293, in load
    return loads(fp.read(),
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 207, in load
    self.__load_ckpt(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 104, in __load_ckpt
    pipeline = StableDiffusionXLPipeline.from_single_file(
  File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\huggingface_hub\utils\_validators.py", line 118, in _inner_fn
    return fn(*args, **kwargs)
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file.py", line 289, in from_single_file
    components = build_sub_model_components(
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file.py", line 102, in build_sub_model_components
    text_encoder_components = create_text_encoders_and_tokenizers_from_ldm(
  File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file_utils.py", line 1485, in create_text_encoders_and_tokenizers_from_ldm
    raise ValueError(
ValueError: With local_files_only set to False, you must first locally save the text_encoder and tokenizer in the following path: 'openai/clip-vit-large-patch14'.

Traceback (most recent call last):
  File "Z:\AI_Image_Generation\OneTrainer\modules\ui\TrainUI.py", line 522, in __training_thread_function
    trainer.start()
  File "Z:\AI_Image_Generation\OneTrainer\modules\trainer\GenericTrainer.py", line 113, in start
    self.model = self.model_loader.load(
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\StableDiffusionXLFineTuneModelLoader.py", line 62, in load
    base_model_loader.load(model, model_type, model_names, weight_dtypes)
  File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 214, in load
    raise Exception("could not load model: " + model_names.base_model)
Exception: could not load model: Z:/AI_Image_Generation/Stability Matrix Backup/Data/Models/StableDiffusion/realvisxlV40_v40Bakedvae.safetensors

@mx
Collaborator

mx commented Apr 30, 2024

The "pip freeze" still looks suspect to me. Is that the pip freeze from inside the venv? I'm seeing stuff there that wouldn't have been installed by the requirements.txt, like triton on Windows.
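If in doubt, this one-liner (run in the same shell you took the freeze from) shows which interpreter is actually being used; it should print a path inside the venv:

python -c "import sys; print(sys.executable)"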

@mx
Collaborator

mx commented Apr 30, 2024

For reference, this is my own "venv\scripts\pip.exe freeze". Note the different torch version, which might also be your problem:

absl-py==2.1.0
accelerate==0.25.0
aiohttp==3.9.5
aiosignal==1.3.1
anndata==0.10.6
antlr4-python3-runtime==4.9.3
array_api_compat==1.5.1
async-timeout==4.0.3
attrs==23.2.0
bitsandbytes==0.43.0
cachetools==5.3.3
came==0.1.13
came-pytorch==0.1.3
certifi==2024.2.2
charset-normalizer==3.3.2
cloudpickle==3.0.0
colorama==0.4.6
coloredlogs==15.0.1
contourpy==1.2.0
customtkinter==5.2.1
cycler==0.12.1
dadaptation==3.2
darkdetect==0.8.0
dgl==2.0.0
-e git+https://github.com/huggingface/diffusers.git@5d848ec07c2011d600ce5e5c1aa02a03152aea9b#egg=diffusers
exceptiongroup==1.2.0
filelock==3.13.4
flatbuffers==24.3.25
fonttools==4.50.0
frozenlist==1.4.1
fsspec==2024.3.1
ftfy==6.2.0
google-auth==2.29.0
google-auth-oauthlib==1.2.0
grpcio==1.62.2
h5py==3.10.0
huggingface-hub==0.20.3
humanfriendly==10.0
idna==3.7
importlib_metadata==7.1.0
invisible-watermark==0.2.0
Jinja2==3.1.3
joblib==1.3.2
kiwisolver==1.4.5
lightning-utilities==0.11.2
lion-pytorch==0.1.2
llvmlite==0.42.0
Markdown==3.6
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.8.3
mdurl==0.1.2
-e git+https://github.com/Nerogar/mgds.git@1dc300967e75b6fa0fb4b72587f3df08a8278efd#egg=mgds
mpmath==1.3.0
multidict==6.0.5
natsort==8.4.0
networkx==3.3
numba==0.59.1
numpy==1.26.2
oauthlib==3.2.2
omegaconf==2.3.0
onnxruntime-gpu==1.16.3
open-clip-torch==2.23.0
opencv-python==4.8.1.78
packaging==24.0
pandas==2.2.1
patsy==0.5.6
pillow==10.2.0
platformdirs==4.2.0
pooch==1.8.0
prodigyopt==1.0
protobuf==4.23.4
psutil==5.9.8
py-spy==0.3.14
pyasn1==0.6.0
pyasn1_modules==0.4.0
Pygments==2.17.2
pynndescent==0.5.11
pynvml==11.5.0
pyparsing==3.1.2
pyreadline3==3.4.1
python-dateutil==2.9.0.post0
pytorch-lightning==2.1.3
pytz==2024.1
PyWavelets==1.6.0
PyYAML==6.0.1
regex==2024.4.16
requests==2.31.0
requests-oauthlib==2.0.0
rich==13.7.1
rsa==4.9
safetensors==0.4.1
scalene==1.5.39
scanpy==1.9.8
schedulefree==1.2.1
scikit-learn==1.4.1.post1
scipy==1.12.0
seaborn==0.13.2
sentencepiece==0.2.0
session_info==1.0.0
six==1.16.0
statsmodels==0.14.1
stdlib-list==0.10.0
sympy==1.12
tensorboard==2.15.1
tensorboard-data-server==0.7.2
threadpoolctl==3.4.0
timm==0.9.16
tokenizers==0.15.2
torch==2.2.0+cu118
torchdata==0.7.1
torchmetrics==1.3.2
torchvision==0.17.0+cu118
tqdm==4.66.1
transformers==4.36.2
typing_extensions==4.11.0
tzdata==2024.1
umap-learn==0.5.5
urllib3==2.2.1
wcwidth==0.2.13
Werkzeug==3.0.2
xformers==0.0.24+cu118
yarl==1.9.4
zipp==3.18.1

@supermachine77
Author

Again, thanks for the reply mx, appreciate the help!

Sorry, I did take the pip freeze in my first post from the wrong place. I took it again in the correct location and found a bunch of libraries missing compared to your list, so I installed the missing ones manually. My new list is below, and it matches yours exactly.

The same error still persists though. Short of doing a full Windows reinstall, is there anything else I could do?

(venv) Z:\AI_Image_Generation\OneTrainer\venv>scripts\pip.exe freeze
absl-py==2.1.0
accelerate==0.25.0
aiohttp==3.9.5
aiosignal==1.3.1
anndata==0.10.6
antlr4-python3-runtime==4.9.3
array_api_compat==1.5.1
async-timeout==4.0.3
attrs==23.2.0
bitsandbytes==0.43.0
cachetools==5.3.3
came==0.1.13
came-pytorch==0.1.3
certifi==2024.2.2
charset-normalizer==3.3.2
cloudpickle==3.0.0
colorama==0.4.6
coloredlogs==15.0.1
contourpy==1.2.0
customtkinter==5.2.1
cycler==0.12.1
dadaptation==3.2
darkdetect==0.8.0
dgl==2.0.0
-e git+https://github.com/huggingface/diffusers.git@5d848ec07c2011d600ce5e5c1aa02a03152aea9b#egg=diffusers
exceptiongroup==1.2.0
filelock==3.13.4
flatbuffers==24.3.25
fonttools==4.50.0
frozenlist==1.4.1
fsspec==2024.3.1
ftfy==6.2.0
google-auth==2.29.0
google-auth-oauthlib==1.2.0
grpcio==1.62.2
h5py==3.10.0
huggingface-hub==0.20.3
humanfriendly==10.0
idna==3.7
importlib_metadata==7.1.0
invisible-watermark==0.2.0
Jinja2==3.1.3
joblib==1.3.2
kiwisolver==1.4.5
lightning-utilities==0.11.2
lion-pytorch==0.1.2
llvmlite==0.42.0
Markdown==3.6
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.8.3
mdurl==0.1.2
-e git+https://github.com/Nerogar/mgds.git@1dc300967e75b6fa0fb4b72587f3df08a8278efd#egg=mgds
mpmath==1.3.0
multidict==6.0.5
natsort==8.4.0
networkx==3.3
numba==0.59.1
numpy==1.26.2
oauthlib==3.2.2
omegaconf==2.3.0
onnxruntime-gpu==1.16.3
open-clip-torch==2.23.0
opencv-python==4.8.1.78
packaging==24.0
pandas==2.2.1
patsy==0.5.6
pillow==10.2.0
platformdirs==4.2.0
pooch==1.8.0
prodigyopt==1.0
protobuf==4.23.4
psutil==5.9.8
py-spy==0.3.14
pyasn1==0.6.0
pyasn1_modules==0.4.0
Pygments==2.17.2
pynndescent==0.5.11
pynvml==11.5.0
pyparsing==3.1.2
pyreadline3==3.4.1
python-dateutil==2.9.0.post0
pytorch-lightning==2.1.3
pytz==2024.1
PyWavelets==1.6.0
PyYAML==6.0.1
regex==2024.4.16
requests==2.31.0
requests-oauthlib==2.0.0
rich==13.7.1
rsa==4.9
safetensors==0.4.1
scalene==1.5.39
scanpy==1.9.8
schedulefree==1.2.1
scikit-learn==1.4.1.post1
scipy==1.12.0
seaborn==0.13.2
sentencepiece==0.2.0
session-info==1.0.0
six==1.16.0
statsmodels==0.14.1
stdlib-list==0.10.0
sympy==1.12
tensorboard==2.15.1
tensorboard-data-server==0.7.2
threadpoolctl==3.4.0
timm==0.9.16
tokenizers==0.15.2
torch==2.2.0+cu118
torchdata==0.7.1
torchmetrics==1.3.2
torchvision==0.17.0+cu118
tqdm==4.66.1
transformers==4.36.2
typing_extensions==4.11.0
tzdata==2024.1
umap-learn==0.5.5
urllib3==2.2.1
wcwidth==0.2.13
Werkzeug==3.0.2
xformers==0.0.24+cu118
yarl==1.9.4
zipp==3.18.1

@mx
Collaborator

mx commented Apr 30, 2024

Can you try using a different checkpoint? I'm wondering if what's happening here is just that the checkpoint you are using got corrupted somewhere along the line and now it just can't be loaded.
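If you want to rule out corruption directly, here's a minimal sketch (Python, run inside the venv; the path is illustrative). A corrupted file usually fails while the header is being parsed, before any model code runs:

from safetensors import safe_open

# Lazily opens the checkpoint and lists its tensors; a truncated or
# corrupted file raises a parsing error here instead.
with safe_open("Z:/AI_Image_Generation/OneTrainer/RealVisXL_V4.0.safetensors", framework="pt", device="cpu") as f:
    print(len(f.keys()), "tensors found")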

@supermachine77
Author

Just tried 3 other SDXL models; exact same error message (just with the updated model names).

@mx
Collaborator

mx commented Apr 30, 2024

I'm honestly not sure what's going on here. I haven't seen this with anyone else using SDXL, and your config has no obvious red flags to me; the model is specified in the one correct place. Can you upload the very latest config.json you're using (made by clicking the "Export" button) for the latest SDXL model you tried? I doubt it will show anything, but maybe looking at two different configs will make something click.

@supermachine77
Author

Config file exported from the GUI here:
config.json

@mookiexl

mookiexl commented May 1, 2024

I'm having the same issue.

It happens with every custom checkpoint, and it started happening after commit 52520c6 ("Merge branch 'universal_embeddings'", 2024-04-16).

It seems like it's trying to pull v1-inference.yaml from GitHub over and over (instead of using the local model_config like it used to), and if it can't reach it, it crashes with this error.
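For illustration, the converter call seen in the traceback below does accept a local config (a sketch; the parameter names are as they appear in diffusers' convert_from_ckpt around this revision, and the paths are hypothetical). When original_config_file is None, it falls back to downloading the YAML:

from diffusers.pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt

# Passing a local v1-inference.yaml avoids the GitHub fetch entirely.
pipeline = download_from_original_stable_diffusion_ckpt(
    "dreamshaper_8.safetensors",               # hypothetical checkpoint path
    original_config_file="v1-inference.yaml",  # local copy of the config
    from_safetensors=True,
)

The crash when that fetch is blocked: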

Traceback (most recent call last):
  File "D:\OneTrainer\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 219, in load
    self.__load_internal(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "D:\OneTrainer\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 28, in __load_internal
    self.__load_diffusers(model, model_type, weight_dtypes, base_model_name, vae_model_name)
  File "D:\OneTrainer\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 38, in __load_diffusers
    tokenizer = CLIPTokenizer.from_pretrained(
  File "D:\OneTrainer\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1925, in from_pretrained
    raise ValueError(
ValueError: Calling CLIPTokenizer.from_pretrained() with the path to a single file or url is not supported for this tokenizer. Use a model identifier or the path to a directory instead.

Traceback (most recent call last):
  File "D:\OneTrainer\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 225, in load
    self.__load_diffusers(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "D:\OneTrainer\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 38, in __load_diffusers
    tokenizer = CLIPTokenizer.from_pretrained(
  File "D:\OneTrainer\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1925, in from_pretrained
    raise ValueError(
ValueError: Calling CLIPTokenizer.from_pretrained() with the path to a single file or url is not supported for this tokenizer. Use a model identifier or the path to a directory instead.

Traceback (most recent call last):
  File "D:\OneTrainer\venv\lib\site-packages\urllib3\connection.py", line 198, in _new_conn
    sock = connection.create_connection(
  File "D:\OneTrainer\venv\lib\site-packages\urllib3\util\connection.py", line 85, in create_connection
    raise err
  File "D:\OneTrainer\venv\lib\site-packages\urllib3\util\connection.py", line 73, in create_connection
    sock.connect(sa)
PermissionError: [WinError 10013] An attempt was made to access a socket in a way forbidden by its access permissions

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "D:\OneTrainer\venv\lib\site-packages\urllib3\connectionpool.py", line 793, in urlopen
    response = self._make_request(
  File "D:\OneTrainer\venv\lib\site-packages\urllib3\connectionpool.py", line 491, in _make_request
    raise new_e
  File "D:\OneTrainer\venv\lib\site-packages\urllib3\connectionpool.py", line 467, in _make_request
    self._validate_conn(conn)
  File "D:\OneTrainer\venv\lib\site-packages\urllib3\connectionpool.py", line 1099, in _validate_conn
    conn.connect()
  File "D:\OneTrainer\venv\lib\site-packages\urllib3\connection.py", line 616, in connect
    self.sock = sock = self._new_conn()
  File "D:\OneTrainer\venv\lib\site-packages\urllib3\connection.py", line 213, in _new_conn
    raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x000002731BD28BE0>: Failed to establish a new connection: [WinError 10013] An attempt was made to access a socket in a way forbidden by its access permissions

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "D:\OneTrainer\venv\lib\site-packages\requests\adapters.py", line 486, in send
    resp = conn.urlopen(
  File "D:\OneTrainer\venv\lib\site-packages\urllib3\connectionpool.py", line 847, in urlopen
    retries = retries.increment(
  File "D:\OneTrainer\venv\lib\site-packages\urllib3\util\retry.py", line 515, in increment
    raise MaxRetryError(_pool, url, reason) from reason  # type: ignore[arg-type]
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002731BD28BE0>: Failed to establish a new connection: [WinError 10013] An attempt was made to access a socket in a way forbidden by its access permissions'))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\OneTrainer\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 231, in load
    self.__load_safetensors(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "D:\OneTrainer\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 176, in __load_safetensors
    pipeline = download_from_original_stable_diffusion_ckpt(
  File "D:\OneTrainer\venv\src\diffusers\src\diffusers\pipelines\stable_diffusion\convert_from_ckpt.py", line 1319, in download_from_original_stable_diffusion_ckpt
    original_config_file = BytesIO(requests.get(config_url).content)
  File "D:\OneTrainer\venv\lib\site-packages\requests\api.py", line 73, in get
    return request("get", url, params=params, **kwargs)
  File "D:\OneTrainer\venv\lib\site-packages\requests\api.py", line 59, in request
    return session.request(method=method, url=url, **kwargs)
  File "D:\OneTrainer\venv\lib\site-packages\requests\sessions.py", line 589, in request
    resp = self.send(prep, **send_kwargs)
  File "D:\OneTrainer\venv\lib\site-packages\requests\sessions.py", line 703, in send
    r = adapter.send(request, **kwargs)
  File "D:\OneTrainer\venv\lib\site-packages\requests\adapters.py", line 519, in send
    raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002731BD28BE0>: Failed to establish a new connection: [WinError 10013] An attempt was made to access a socket in a way forbidden by its access permissions'))

Traceback (most recent call last):
  File "D:\OneTrainer\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 237, in load
    self.__load_ckpt(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "D:\OneTrainer\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 121, in __load_ckpt
    state_dict = torch.load(base_model_name)
  File "D:\OneTrainer\venv\lib\site-packages\torch\serialization.py", line 1040, in load
    return _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
  File "D:\OneTrainer\venv\lib\site-packages\torch\serialization.py", line 1258, in _legacy_load
    magic_number = pickle_module.load(f, **pickle_load_args)
_pickle.UnpicklingError: invalid load key, ' '.

Traceback (most recent call last):
  File "D:\OneTrainer\modules\ui\TrainUI.py", line 522, in __training_thread_function
    trainer.start()
  File "D:\OneTrainer\modules\trainer\GenericTrainer.py", line 113, in start
    self.model = self.model_loader.load(
  File "D:\OneTrainer\modules\modelLoader\StableDiffusionLoRAModelLoader.py", line 64, in load
    base_model_loader.load(model, model_type, model_names, weight_dtypes)
  File "D:\OneTrainer\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 244, in load
    raise Exception("could not load model: " + model_names.base_model)
Exception: could not load model: D:\Program Files\Krita (x64)\share\krita\pykrita\ai_diffusion\.server\ComfyUI\models\checkpoints\dreamshaper_8.safetensors

I was able to get it working temporarily by disabling the firewall, but it crashes again on every training start unless OneTrainer has unrestricted internet access.

I'm not 100% sure it's the exact same issue, but it does seem to be related.

@mx
Collaborator

mx commented May 1, 2024

Nice debugging, and nice find! Talk about an unusual combination of circumstances. Fix forthcoming.

@Nerogar
Owner

Nerogar commented May 1, 2024

@supermachine77 can OneTrainer access the internet on your machine? This might not be the exact same issue, but it could be related. And do you know if the issue existed before, or is it something new?

I've analyzed this a bit, and I'm a bit confused. The safetensors files don't include a tokenizer, so to load them, a tokenizer needs to be downloaded from the internet first. By default it tries to use the one from Hugging Face called "openai/clip-vit-large-patch14". If you've already downloaded that into your Hugging Face cache, it will use the cached version. I don't think this has ever worked in the past without at least some kind of internet connection.
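As a sketch, you could pre-populate that cache once, from a session that does have internet access (same venv), so subsequent loads can use the cached copy:

from transformers import CLIPTokenizer

# Downloads the tokenizer files into the local Hugging Face cache
# (or reuses them if they are already cached).
CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")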

@Nerogar
Owner

Nerogar commented May 1, 2024

@mookiexl I couldn't reproduce your problem. It's always loading the local v1-inference.yaml file. Can you attach a debugger, or put some print statements in your code? Specifically, I'm interested in model.sd_config_filename just before the call to base_model_loader.load in StableDiffusionFineTuneModelLoader.py, line 86

If this is None, original_config_file will be None inside download_from_original_stable_diffusion_ckpt. As a fallback, that function then tries to download https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml
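Something like this would do (a sketch; the call signature is copied from the tracebacks above):

# just before the call around line 86 of StableDiffusionFineTuneModelLoader.py
print("sd_config_filename:", model.sd_config_filename)  # None here triggers the download fallback
base_model_loader.load(model, model_type, model_names, weight_dtypes)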

@mookiexl

mookiexl commented May 1, 2024

Ok, so apparently it happens only when training a LoRA; fine-tuning works.

Clean install from scratch, clean VENV, changed preset to "SD 1.5 LoRA", everything else left at default.

- Using the default runwayml/stable-diffusion-v1-5, firewall disabled: works.
- Firewall enabled, runwayml/stable-diffusion-v1-5: still works.
- Changed the base model to Dreamshaper_8.safetensors (or anything else): CRASH.

model.sd_config_filename is None (in StableDiffusionLoRAModelLoader.py)

@Nerogar
Owner

Nerogar commented May 1, 2024

Ok, so apparently it happens only when training a LoRA; fine-tuning works.

With LoRA training I can reproduce the issue. Let me think about a solution.

@mookiexl

mookiexl commented May 1, 2024

Just noticed that @supermachine77 seems to be doing fine-tuning, based on "training_method": "FINE_TUNE", so there could be a separate problem that also leads to the same "ValueError: Calling CLIPTokenizer.from_pretrained() with the path to a single file or url ..." error?

@Nerogar
Owner

Nerogar commented May 1, 2024

@mookiexl your issue should be solved now

@mookiexl

mookiexl commented May 1, 2024

Yes, it seems to work for me now. Thanks.

@mx
Collaborator

mx commented May 3, 2024

@supermachine77 Did the above fixes resolve your issue? Does it still happen at the latest HEAD?

@supermachine77
Author

supermachine77 commented May 5, 2024

@mx - will test it and let you know later today, fingers crossed it's resolved!

Update

Pulled the latest update (via the update.bat file) and unfortunately I get the exact same error message. I've also tried a fresh pull of the latest version into a new folder, with no luck there either. Regarding internet access: yes, OneTrainer should be able to access it fine. I've also tried running it with my firewall turned off, but I hit the exact same issue. Is there anything else I could share to make the debugging process a little easier?

@Chippd

Chippd commented May 28, 2024

Found this thread as I'm having the same issue with loading models.
I'm trying to train a LoRA on SD 1.5; I can load non-inpainting models fine, but inpainting ones give me the error:
Exception: could not load model: C:/Users/Chris/Downloads/realisticVisionV60B1_v51HyperInpaintVAE.safetensors
Models that fail (inpainting)

Models that work (non-inpainting)

Does this look like the same issue or something different?
Error log

Traceback (most recent call last):
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 249, in load
    self.__load_internal(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 55, in __load_internal
    self.__load_diffusers(model, model_type, weight_dtypes, base_model_name, vae_model_name)
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 65, in __load_diffusers
    tokenizer = CLIPTokenizer.from_pretrained(
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1925, in from_pretrained
    raise ValueError(
ValueError: Calling CLIPTokenizer.from_pretrained() with the path to a single file or url is not supported for this tokenizer. Use a model identifier or the path to a directory instead.

Traceback (most recent call last):
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 255, in load
    self.__load_diffusers(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 65, in __load_diffusers
    tokenizer = CLIPTokenizer.from_pretrained(
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1925, in from_pretrained
    raise ValueError(
ValueError: Calling CLIPTokenizer.from_pretrained() with the path to a single file or url is not supported for this tokenizer. Use a model identifier or the path to a directory instead.

Traceback (most recent call last):
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 261, in load
    self.__load_safetensors(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 203, in __load_safetensors
    pipeline = download_from_original_stable_diffusion_ckpt(
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\venv\src\diffusers\src\diffusers\pipelines\stable_diffusion\convert_from_ckpt.py", line 1472, in download_from_original_stable_diffusion_ckpt
    set_module_tensor_to_device(unet, param_name, "cpu", value=param)
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\venv\lib\site-packages\accelerate\utils\modeling.py", line 285, in set_module_tensor_to_device
    raise ValueError(
ValueError: Trying to set a tensor of shape torch.Size([320, 9, 3, 3]) in "weight" (which has shape torch.Size([320, 4, 3, 3])), this look incorrect.

Traceback (most recent call last):
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 267, in load
    self.__load_ckpt(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 148, in __load_ckpt
    state_dict = torch.load(base_model_name)
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\venv\lib\site-packages\torch\serialization.py", line 1040, in load
    return _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\venv\lib\site-packages\torch\serialization.py", line 1258, in _legacy_load
    magic_number = pickle_module.load(f, **pickle_load_args)
_pickle.UnpicklingError: invalid load key, '\x00'.

Traceback (most recent call last):
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\modules\ui\TrainUI.py", line 517, in __training_thread_function
    trainer.start()
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\modules\trainer\GenericTrainer.py", line 113, in start
    self.model = self.model_loader.load(
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\modules\modelLoader\StableDiffusionLoRAModelLoader.py", line 60, in load
    base_model_loader.load(model, model_type, model_names, weight_dtypes)
  File "C:\Users\Chris\Documents\AI tools\OneTrainer-master\modules\modelLoader\stableDiffusion\StableDiffusionModelLoader.py", line 274, in load
    raise Exception("could not load model: " + model_names.base_model)
Exception: could not load model: C:/Users/Chris/Downloads/realisticVisionV60B1_v51HyperInpaintVAE.safetensors
