From 41ba68be0e80aaa89f54470b7c23f38517b725da Mon Sep 17 00:00:00 2001 From: sunny Date: Thu, 23 Apr 2026 21:47:12 +0800 Subject: [PATCH 1/5] add the configs for qwen3-vl-8b-instruct model --- configs/qwen3-vl-8b-instruct-eagle3.json | 41 ++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 configs/qwen3-vl-8b-instruct-eagle3.json diff --git a/configs/qwen3-vl-8b-instruct-eagle3.json b/configs/qwen3-vl-8b-instruct-eagle3.json new file mode 100644 index 00000000..df60f887 --- /dev/null +++ b/configs/qwen3-vl-8b-instruct-eagle3.json @@ -0,0 +1,41 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "image_token_id": 151655, + "model_type": "llama", + "target_model_type": "qwen3_vl", + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "max_position_embeddings": 262144, + "num_attention_heads": 32, + "num_hidden_layers": 1, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_interleaved": true, + "mrope_section": [ + 24, + 20, + 20 + ], + "rope_type": "mrope" + }, + "rope_theta": 5000000, + "use_cache": true, + "vocab_size": 151936, + "tie_word_embeddings": false, + "transformers_version": "4.57.0.dev0", + "video_token_id": 151656, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "draft_vocab_size": 32000 +} From b894fbc4505ac6f4af084c3acfea037a206cead9 Mon Sep 17 00:00:00 2001 From: sunny Date: Fri, 24 Apr 2026 16:04:37 +0800 Subject: [PATCH 2/5] feat: add is_vlm param to safe_conversations_generator for multimodal support --- scripts/prepare_hidden_states.py | 2 +- scripts/train_eagle3.py | 4 ++-- specforge/utils.py | 16 +++++++++++++--- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/scripts/prepare_hidden_states.py b/scripts/prepare_hidden_states.py index 30ce9194..68287e42 100644 --- a/scripts/prepare_hidden_states.py +++ b/scripts/prepare_hidden_states.py @@ -618,7 +618,7 @@ def main(): print_with_rank("Loading/building dataset cache...") dataset = Dataset.from_generator( generator=safe_conversations_generator, - gen_kwargs={"file_path": args.data_path}, + gen_kwargs={"file_path": args.data_path, "is_vlm": args.is_vlm}, cache_dir=os.path.join( os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "cache", diff --git a/scripts/train_eagle3.py b/scripts/train_eagle3.py index 0bd157b3..ab262979 100644 --- a/scripts/train_eagle3.py +++ b/scripts/train_eagle3.py @@ -456,7 +456,7 @@ def build_dataloaders( cache_key = hashlib.md5(cache_params_string.encode()).hexdigest() train_dataset = Dataset.from_generator( generator=safe_conversations_generator, - gen_kwargs={"file_path": args.train_data_path}, + gen_kwargs={"file_path": args.train_data_path, "is_vlm": args.is_vlm}, ) is_online = ( args.train_data_path is not None and args.train_hidden_states_path is None @@ -507,7 +507,7 @@ def build_dataloaders( if args.eval_data_path is not None: eval_dataset = Dataset.from_generator( generator=safe_conversations_generator, - gen_kwargs={"file_path": args.eval_data_path}, + gen_kwargs={"file_path": args.eval_data_path, "is_vlm": args.is_vlm}, ) eval_eagle3_dataset = build_eagle3_dataset( eval_dataset, diff --git a/specforge/utils.py b/specforge/utils.py index af4d627c..eb0dcb99 100644 --- a/specforge/utils.py +++ b/specforge/utils.py @@ -3,6 +3,7 @@ import os import re from contextlib import contextmanager +from unittest import result import torch import torch.distributed as dist @@ -328,12 +329,15 @@ def shard_optimizer_state_with_dtensor(bf16_optimizer, device_mesh): ) -def safe_conversations_generator(file_path): +def safe_conversations_generator(file_path, is_vlm=False): """ Generator that: 1. Extracts the 'conversations' field. 2. Preserves all original fields within each message. 3. [Key step] Converts all list/dict-type field values to strings to resolve mixed-type conflicts (e.g., for Arrow compatibility). + Args: + file_path: Path to the JSONL file. + is_vlm: If True, include 'image' field for vision-language models. Default False. """ with open(file_path, "r", encoding="utf-8") as f: for i, line in enumerate(f): @@ -376,8 +380,14 @@ def safe_conversations_generator(file_path): cleaned_convs.append(new_msg) - # Build result with conversations - result = {"conversations": cleaned_convs} + if is_vlm: + image = row.get("image", "") + result = { + "conversation":cleaned_convs, + "image":image + } + else: + result = {"conversations": cleaned_convs} # Preserve 'tools' field if present if "tools" in row: From 9defccfe3a5b3aac0b5ad3f60b075527e4bf172a Mon Sep 17 00:00:00 2001 From: sunny Date: Fri, 24 Apr 2026 16:11:15 +0800 Subject: [PATCH 3/5] feat: add is_vlm param to safe_conversations_generator for multimodal support v2 --- configs/qwen3-vl-8b-instruct-eagle3.json | 41 ------------------------ 1 file changed, 41 deletions(-) delete mode 100644 configs/qwen3-vl-8b-instruct-eagle3.json diff --git a/configs/qwen3-vl-8b-instruct-eagle3.json b/configs/qwen3-vl-8b-instruct-eagle3.json deleted file mode 100644 index df60f887..00000000 --- a/configs/qwen3-vl-8b-instruct-eagle3.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLMEagle3" - ], - "image_token_id": 151655, - "model_type": "llama", - "target_model_type": "qwen3_vl", - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 151643, - "dtype": "bfloat16", - "eos_token_id": 151645, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 12288, - "max_position_embeddings": 262144, - "num_attention_heads": 32, - "num_hidden_layers": 1, - "num_key_value_heads": 8, - "rms_norm_eps": 1e-06, - "rope_scaling": { - "mrope_interleaved": true, - "mrope_section": [ - 24, - 20, - 20 - ], - "rope_type": "mrope" - }, - "rope_theta": 5000000, - "use_cache": true, - "vocab_size": 151936, - "tie_word_embeddings": false, - "transformers_version": "4.57.0.dev0", - "video_token_id": 151656, - "vision_end_token_id": 151653, - "vision_start_token_id": 151652, - "draft_vocab_size": 32000 -} From eadc83b752dc98275cd73ea94da611c3308d6e22 Mon Sep 17 00:00:00 2001 From: sunny Date: Fri, 24 Apr 2026 16:58:15 +0800 Subject: [PATCH 4/5] fix: fixed some formatting issues. --- specforge/utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/specforge/utils.py b/specforge/utils.py index eb0dcb99..f90c7d71 100644 --- a/specforge/utils.py +++ b/specforge/utils.py @@ -3,7 +3,6 @@ import os import re from contextlib import contextmanager -from unittest import result import torch import torch.distributed as dist @@ -383,11 +382,11 @@ def safe_conversations_generator(file_path, is_vlm=False): if is_vlm: image = row.get("image", "") result = { - "conversation":cleaned_convs, - "image":image + "conversations": cleaned_convs, + "image":image, } else: - result = {"conversations": cleaned_convs} + result = {"conversations": cleaned_convs} # Preserve 'tools' field if present if "tools" in row: From a91c6985618c019830cca98266ddd546cbe3629a Mon Sep 17 00:00:00 2001 From: sunny Date: Fri, 24 Apr 2026 17:13:16 +0800 Subject: [PATCH 5/5] fix: fixed some formatting issues v2 --- specforge/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/specforge/utils.py b/specforge/utils.py index f90c7d71..83f81faa 100644 --- a/specforge/utils.py +++ b/specforge/utils.py @@ -383,7 +383,7 @@ def safe_conversations_generator(file_path, is_vlm=False): image = row.get("image", "") result = { "conversations": cleaned_convs, - "image":image, + "image": image, } else: result = {"conversations": cleaned_convs}