Spaces:
Running
on
Zero
Running
on
Zero
| import os | |
| import json | |
| from typing import List, Dict, Optional, Tuple | |
# Lower-case filename suffixes treated as images. Kept as a tuple so it can be
# passed directly to ``str.endswith``.
IMAGE_EXTENSIONS = (
    '.jpg',
    '.jpeg',
    '.png',
    '.bmp',
    '.gif',
)
def _join_posix(base: str, name: str) -> str:
    """Concatenate ``base`` and ``name`` with exactly one ``/`` between them.

    A falsy ``base`` yields ``name`` unchanged. Otherwise trailing slashes are
    stripped from ``base`` and leading slashes from ``name`` before joining,
    so the result never contains a doubled separator at the seam.
    """
    if base:
        head = base.rstrip('/')
        tail = name.lstrip('/')
        return f"{head}/{tail}"
    return name
def _list_images(input_folder: str) -> List[str]:
    """Return the sorted names of image files directly inside ``input_folder``.

    A directory entry qualifies when its lower-cased name ends with one of
    ``IMAGE_EXTENSIONS`` and it is a regular file. Only bare file names are
    returned, not full paths; sub-directories are not descended into.
    """
    return [
        name
        for name in sorted(os.listdir(input_folder))
        if name.lower().endswith(IMAGE_EXTENSIONS)
        and os.path.isfile(os.path.join(input_folder, name))
    ]
def _validate_controls_strict(filenames: List[str], control_dirs: List[str]) -> None:
    """Require every file name to exist in every control directory.

    Does nothing when ``control_dirs`` is empty. Otherwise each
    ``control_dir/filename`` combination is checked with ``os.path.exists``;
    if any are absent, all of them are printed (tagged with the position of
    the directory in ``control_dirs``) and the process is aborted.

    Raises:
        SystemExit: with exit code 1 when at least one expected path is missing.
    """
    if not control_dirs:
        return

    absent: List[str] = []
    for fname in filenames:
        candidates = (os.path.join(cdir, fname) for cdir in control_dirs)
        absent.extend(
            f"[control_dir_{pos}] {path}"
            for pos, path in enumerate(candidates)
            if not os.path.exists(path)
        )

    if not absent:
        return

    print("エラー: 以下のファイルが見つかりませんでした(strict):")
    for line in absent:
        print(" - " + line)
    print(f"合計 {len(absent)} 件の不足が見つかりました。処理を中断します。")
    raise SystemExit(1)
def _normalize(name: str, prefix: str = "", suffix: str = "") -> str:
    """Reduce a file name to its matching key.

    The directory part and the final extension are dropped, then ``prefix``
    is removed from the front (if present) and ``suffix`` from the end (if
    present), in that order. Empty ``prefix``/``suffix`` are ignored.
    """
    key, _ext = os.path.splitext(os.path.basename(name))
    if prefix and key.startswith(prefix):
        key = key[len(prefix):]
    if suffix and key.endswith(suffix):
        # The key ends with the suffix here, so this slice drops exactly it.
        key = key[:len(key) - len(suffix)]
    return key
def _list_image_files(folder: str) -> List[str]:
    """Return the sorted names of regular image files found in ``folder``.

    Names are matched case-insensitively against ``IMAGE_EXTENSIONS``; only
    the bare file names (not full paths) are returned.
    """
    found: List[str] = []
    for candidate in sorted(os.listdir(folder)):
        if not candidate.lower().endswith(IMAGE_EXTENSIONS):
            continue
        if os.path.isfile(os.path.join(folder, candidate)):
            found.append(candidate)
    return found
def _build_entry(
    image_dir: str,
    control_dirs: List[Optional[str]],
    caption: str,
    image_filename: str,
) -> Dict[str, str]:
    """Build one JSONL record for ``image_filename``.

    The record always contains ``image_path`` and ``caption``. Control paths
    are added for every non-empty entry of ``control_dirs``:

    - exactly one control directory -> ``control_path``
    - several control directories   -> ``control_path_0``, ``control_path_1``, ...

    Empty or ``None`` slots in ``control_dirs`` are skipped. Without this,
    an unset slot would be joined as an empty base and produce a
    ``control_path_i`` that is just the bare file name.

    Args:
        image_dir: Parent directory written into ``image_path``.
        control_dirs: Parent directories for the control images; may contain
            ``None``/empty placeholders for unused slots.
        caption: Caption text stored verbatim.
        image_filename: File name appended to every directory.

    Returns:
        The record as a plain ``dict`` of strings.
    """
    entry: Dict[str, str] = {
        "image_path": _join_posix(image_dir, image_filename),
        "caption": caption,
    }
    active_dirs = [cdir for cdir in control_dirs if cdir]
    if len(active_dirs) == 1:
        entry["control_path"] = _join_posix(active_dirs[0], image_filename)
    else:
        # Zero active dirs: the loop does nothing and no control key is added.
        for i, cdir in enumerate(active_dirs):
            entry[f"control_path_{i}"] = _join_posix(cdir, image_filename)
    return entry
def _affix_at(values: Optional[List[Optional[str]]], index: int) -> str:
    """Return ``values[index]``, or ``""`` when the list or that slot is missing/empty."""
    if values and index < len(values) and values[index]:
        return values[index]
    return ""


def create_image_caption_json_unified(
    input_folder: str,
    image_dir: str,
    control_dirs: List[Optional[str]],
    caption: str,
    output_json: str,
    *,
    target_prefix: str = "",
    target_suffix: str = "",
    control_prefixes: Optional[List[Optional[str]]] = None,
    control_suffixes: Optional[List[Optional[str]]] = None,
    allow_single: bool = True,
) -> None:
    """Write a JSONL file pairing each image in ``input_folder`` with a caption.

    One record is written per image file found in ``input_folder``:

    - no control directory:      ``{image_path, caption}``
    - one control directory:     ``{image_path, control_path, caption}``
    - several control directories:
      ``{image_path, control_path_0..N, caption}``

    ``None``/empty slots in ``control_dirs`` are ignored; the remaining
    directories are numbered consecutively in the output keys.

    Two matching modes exist:

    * Strict (no target prefix/suffix and no control prefix/suffix lists):
      every image file name must exist unchanged in every control directory.
    * Name matching (otherwise): the image name is reduced to a key by
      stripping its extension and ``target_prefix``/``target_suffix``; the
      control file is expected at ``<control_prefix><key><control_suffix>.png``
      where the prefix/suffix are looked up by the directory's position in
      ``control_dirs``. If that file is absent and ``allow_single`` is true
      and the control directory holds exactly one image, that image is used.

    All records are resolved before the output file is opened, so a matching
    failure does not leave a truncated or partial file behind.

    Args:
        input_folder: Directory whose image files are enumerated.
        image_dir: Parent directory written into ``image_path``.
        control_dirs: Control directories by slot; unused slots may be ``None``.
        caption: Caption stored in every record.
        output_json: Destination JSONL path; parent directories are created.
        target_prefix: Prefix stripped from image names to form the key.
        target_suffix: Suffix stripped from image names to form the key.
        control_prefixes: Per-slot prefix prepended to the key.
        control_suffixes: Per-slot suffix appended to the key.
        allow_single: Permit the single-image fallback described above.

    Raises:
        SystemExit: when a required control file is missing.
    """
    filenames = _list_images(input_folder)

    # (slot index, directory) for the control slots that were actually given.
    # The slot index is kept because prefixes/suffixes are addressed by it.
    active = [(slot, cdir) for slot, cdir in enumerate(control_dirs) if cdir]
    active_dirs = [cdir for _, cdir in active]

    # NOTE(review): the lists are tested for truthiness, not their contents,
    # so a non-empty list of empty strings (what the CLI passes) still selects
    # name matching. This preserves the existing CLI behaviour; confirm whether
    # strict mode was meant to be reachable from the command line.
    use_name_matching = bool(
        target_prefix or target_suffix or control_prefixes or control_suffixes
    )

    entries: List[Dict[str, str]] = []
    if not use_name_matching:
        _validate_controls_strict(filenames, active_dirs)
        entries = [
            _build_entry(image_dir, active_dirs, caption, fname)
            for fname in filenames
        ]
    else:
        single_control = len(active) == 1
        # Directory listings for the single-image fallback, read at most once
        # per control directory instead of once per unmatched image.
        listings: Dict[str, List[str]] = {}
        for fname in filenames:
            key = _normalize(fname, target_prefix, target_suffix)
            entry: Dict[str, str] = {
                "image_path": _join_posix(image_dir, fname),
                "caption": caption,
            }
            for position, (slot, cdir) in enumerate(active):
                cprefix = _affix_at(control_prefixes, slot)
                csuffix = _affix_at(control_suffixes, slot)
                expected = f"{cprefix}{key}{csuffix}.png"
                expected_path = os.path.join(cdir, expected)
                if not os.path.exists(expected_path):
                    fallback: List[str] = []
                    if allow_single:
                        if cdir not in listings:
                            listings[cdir] = _list_image_files(cdir)
                        fallback = listings[cdir]
                    if len(fallback) != 1:
                        raise SystemExit(f"対応するコントロールが見つかりません: expected={expected}, control_index={slot}")
                    # Exactly one image in the directory: use it for every target.
                    expected_path = os.path.join(cdir, fallback[0])
                field = "control_path" if single_control else f"control_path_{position}"
                entry[field] = _join_posix(cdir, os.path.basename(expected_path))
            entries.append(entry)

    out_dir = os.path.dirname(output_json)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_json, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n' for entry in entries
        )
    count = len(entries)
    print(f"処理が完了しました。{count}件を書き出しました。結果は {output_json} に保存されました。")
if __name__ == "__main__":
    import argparse

    # Number of --control_dir_N / --control_prefix_N / --control_suffix_N slots.
    max_controls = 8

    parser = argparse.ArgumentParser(description='画像とコントロールの対応JSONLを生成(厳格または名前マッチ)。')
    parser.add_argument('-i', '--input-folder', required=True, help='入力ディレクトリ(画像ファイルを列挙)')
    parser.add_argument('-c', '--caption', required=True, help='キャプション')
    parser.add_argument('-o', '--output-json', default='metadata.jsonl', help='出力JSONLパス(既定: metadata.jsonl)')
    parser.add_argument('--image-dir', default='/workspace/data/image', help='image_pathの親ディレクトリパス(JSON出力用)')
    parser.add_argument('--target_prefix', default=os.environ.get('TARGET_PREFIX', ''), help='ターゲット画像名から削除するプレフィックス')
    parser.add_argument('--target_suffix', default=os.environ.get('TARGET_SUFFIX', ''), help='ターゲット画像名から削除するサフィックス')

    # Accept control slots 0 .. max_controls-1; prefix/suffix defaults may
    # come from the matching CONTROL_PREFIX_N / CONTROL_SUFFIX_N env vars.
    for i in range(max_controls):
        parser.add_argument(
            f'--control_dir_{i}',
            dest=f'control_dir_{i}',
            default=None,
            help=f'control_path_{i}の親ディレクトリパス(JSON出力用)',
        )
        parser.add_argument(
            f'--control_prefix_{i}',
            dest=f'control_prefix_{i}',
            default=os.environ.get(f'CONTROL_PREFIX_{i}', ''),
            help=f'control_{i}のファイル名から削除するプレフィックス',
        )
        parser.add_argument(
            f'--control_suffix_{i}',
            dest=f'control_suffix_{i}',
            default=os.environ.get(f'CONTROL_SUFFIX_{i}', ''),
            help=f'control_{i}のファイル名から削除するサフィックス',
        )

    # The default comes from ALLOW_SINGLE (on unless set to something other
    # than '1'). Because that default is normally True, a store_true flag
    # alone could never switch the option off, hence the paired negative flag.
    allow_single_default = os.environ.get('ALLOW_SINGLE', '1') == '1'
    parser.add_argument('--allow_single', dest='allow_single', action='store_true', default=allow_single_default, help='コントロールが1枚のみのとき全画像に適用')
    parser.add_argument('--no_allow_single', dest='allow_single', action='store_false', default=allow_single_default, help='--allow_single を無効化する')
    args = parser.parse_args()

    # Collect every slot in order 0..max_controls-1; unset directories stay
    # None so that prefixes/suffixes keep lining up with their slot index.
    control_dirs: List[Optional[str]] = [
        getattr(args, f'control_dir_{i}') for i in range(max_controls)
    ]
    control_prefixes = [getattr(args, f'control_prefix_{i}') for i in range(max_controls)]
    control_suffixes = [getattr(args, f'control_suffix_{i}') for i in range(max_controls)]

    create_image_caption_json_unified(
        input_folder=args.input_folder,
        image_dir=args.image_dir,
        control_dirs=control_dirs,
        caption=args.caption,
        output_json=args.output_json,
        target_prefix=args.target_prefix,
        target_suffix=args.target_suffix,
        control_prefixes=control_prefixes,
        control_suffixes=control_suffixes,
        allow_single=args.allow_single,
    )