From 89de92a2052d1daff6bce8e034818ed9c9d4ba7a Mon Sep 17 00:00:00 2001
From: AngelBottomless
Date: Tue, 14 Jan 2025 03:49:22 +0900
Subject: [PATCH 1/2] fix weird timeout issue in concurrency

---
 infinity/dataset/dataset_t2i_iterable.py | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/infinity/dataset/dataset_t2i_iterable.py b/infinity/dataset/dataset_t2i_iterable.py
index 5072910..406f39e 100644
--- a/infinity/dataset/dataset_t2i_iterable.py
+++ b/infinity/dataset/dataset_t2i_iterable.py
@@ -140,7 +140,7 @@ def set_h_div_w_template2generator(self,):
             filename = osp.basename(filepath)
             h_div_w_template, num_of_samples = osp.splitext(filename)[0].split('_')
             num_of_samples = int(num_of_samples)
-            if num_of_samples < self.global_workers:
+            if num_of_samples < max(100, self.global_workers):
                 print(f'{filepath} has too few examples ({num_of_samples}, proportion: {num_of_samples/total_samples*100:.1f}%), < global workers ({self.global_workers})! Skip h_div_w_template: {h_div_w_template}')
                 continue
             print(f'{filepath} has sufficient examples ({num_of_samples}), proportion: {num_of_samples/total_samples*100:.1f}%, > global workers ({self.global_workers})! Preserve h_div_w_template: {h_div_w_template}')
@@ -159,6 +159,7 @@ def split_meta_files(self, ):
         def split_and_sleep(generator_info):
             missing, chunk_id2save_files = get_part_jsonls(generator_info['filepath'], generator_info['num_of_samples'], parts=self.num_replicas)
             if missing:
+                print(f'[data preprocess] missing {missing} files, sleep 10 minutes...')
                 tdist.barrier()
                 if self.rank == 0:
                     split_large_txt_files(generator_info['filepath'], chunk_id2save_files)
@@ -169,15 +170,20 @@ def split_and_sleep(generator_info):
                 tdist.barrier()
             generator_info['part_filepaths'] = sorted(list(chunk_id2save_files.values()))
             return generator_info
-
-        with concurrent.futures.ThreadPoolExecutor(max_workers=cpu_count()) as executor:
-            futures = {executor.submit(split_and_sleep, generator_info): h_div_w_template for h_div_w_template, generator_info in self.h_div_w_template2generator.items()}
-            for future in concurrent.futures.as_completed(futures):
-                h_div_w_template = futures[future]
-                try:
-                    self.h_div_w_template2generator[h_div_w_template] = future.result()
-                except Exception as exc:
-                    print(f'[data preprocess] h_div_w_template {h_div_w_template} generated an exception: {exc}')
+        if False:
+            with concurrent.futures.ThreadPoolExecutor(max_workers=min(32, cpu_count())) as executor:
+                print(f'[data preprocess] split_meta_files with {min(32, cpu_count())} threads')
+                futures = {executor.submit(split_and_sleep, generator_info): h_div_w_template for h_div_w_template, generator_info in self.h_div_w_template2generator.items()}
+                for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc='split_meta_files'):
+                    h_div_w_template = futures[future]
+                    try:
+                        self.h_div_w_template2generator[h_div_w_template] = future.result()
+                        print(f'[data preprocess] h_div_w_template {h_div_w_template} split_meta_files done')
+                    except Exception as exc:
+                        print(f'[data preprocess] h_div_w_template {h_div_w_template} generated an exception: {exc}')
+        else:
+            for h_div_w_template, generator_info in self.h_div_w_template2generator.items():
+                self.h_div_w_template2generator[h_div_w_template] = split_and_sleep(generator_info)

         print('[data preprocess] split_meta_files done')

@@ -261,6 +267,7 @@ def __iter__(self):
                 c_, h_, w_ = model_input[1].shape[-3:]
                 if c_ != 3 or np.abs(h_/w_-float(h_div_w_template)) > 0.01:
                     print(f'Croupt data item: {data_item}')
+                    print(f'c_: {c_}, h_: {h_}, w_: {w_}, h_div_w_template: {h_div_w_template}, error: {np.abs(h_/w_-float(h_div_w_template))}')
                 else:
                     batch_data.append(model_input)
                 del data_item
@@ -323,7 +330,7 @@ def get_text_input(self, long_text_input, short_text_input, long_text_type):

     def prepare_model_input(self, data_item) -> Tuple:
         img_path, h_div_w = data_item['image_path'], data_item['h_div_w']
-        short_text_input, long_text_input = data_item['text'], data_item['long_caption']
+        short_text_input, long_text_input = data_item.get('text', data_item['long_caption']), data_item['long_caption']
         long_text_type = data_item.get('long_caption_type', 'user_prompt')
         text_input = self.get_text_input(long_text_input, short_text_input, long_text_type)
         text_input = process_short_text(text_input)
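[note on PATCH 1/2] split_and_sleep calls tdist.barrier(), so fanning it out through a ThreadPoolExecutor lets several threads on each rank reach the collective in an unpredictable order, which is a plausible source of the "weird timeout" named in the subject line; the patch therefore leaves the threaded code behind `if False:` and always runs the plain serial loop. A minimal, self-contained sketch of making that choice an explicit switch instead of a hard-coded guard (the run_splits helper and the INFINITY_PARALLEL_SPLIT variable are illustrative assumptions, not part of the patch):

    import concurrent.futures
    import os
    from multiprocessing import cpu_count

    def run_splits(template2generator, split_fn):
        """Apply split_fn to every generator; use threads only when explicitly requested."""
        use_threads = os.environ.get("INFINITY_PARALLEL_SPLIT", "0") == "1"  # hypothetical toggle
        if not use_threads:
            # Safe default: serial execution, matching the path this patch enables.
            return {key: split_fn(info) for key, info in template2generator.items()}
        results = {}
        with concurrent.futures.ThreadPoolExecutor(max_workers=min(32, cpu_count())) as executor:
            futures = {executor.submit(split_fn, info): key for key, info in template2generator.items()}
            for future in concurrent.futures.as_completed(futures):
                results[futures[future]] = future.result()
        return results
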
From 666f61c6ac4061f88165bed29e476c4c37e97446 Mon Sep 17 00:00:00 2001
From: AngelBottomless
Date: Tue, 14 Jan 2025 03:51:51 +0900
Subject: [PATCH 2/2] Create split-builder.py

---
 infinity/dataset/split-builder.py | 177 ++++++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 infinity/dataset/split-builder.py

diff --git a/infinity/dataset/split-builder.py b/infinity/dataset/split-builder.py
new file mode 100644
index 0000000..c79111a
--- /dev/null
+++ b/infinity/dataset/split-builder.py
@@ -0,0 +1,177 @@
+import os
+import json
+import glob
+import math
+import multiprocessing
+from tqdm import tqdm
+from PIL import Image
+
+MARKER = ".finished"
+# Paths
+ROOT_DIR = "."  # Root directory that contains 'images'
+IMAGES_DIR = os.path.join(ROOT_DIR, "images")
+OUTPUT_DIR = os.path.join(ROOT_DIR, "split")
+ratio2hws = {
+    1.000: [(1,1),(2,2),(4,4),(6,6),(8,8),(12,12),(16,16),(20,20),(24,24),(32,32),(40,40),(48,48),(64,64)],
+    1.250: [(1,1),(2,2),(3,3),(5,4),(10,8),(15,12),(20,16),(25,20),(30,24),(35,28),(45,36),(55,44),(70,56)],
+    1.333: [(1,1),(2,2),(4,3),(8,6),(12,9),(16,12),(20,15),(24,18),(28,21),(36,27),(48,36),(60,45),(72,54)],
+    1.500: [(1,1),(2,2),(3,2),(6,4),(9,6),(15,10),(21,14),(27,18),(33,22),(39,26),(48,32),(63,42),(78,52)],
+    1.750: [(1,1),(2,2),(3,3),(7,4),(11,6),(14,8),(21,12),(28,16),(35,20),(42,24),(56,32),(70,40),(84,48)],
+    2.000: [(1,1),(2,2),(4,2),(6,3),(10,5),(16,8),(22,11),(30,15),(38,19),(46,23),(60,30),(74,37),(90,45)],
+    2.500: [(1,1),(2,2),(5,2),(10,4),(15,6),(20,8),(25,10),(30,12),(40,16),(50,20),(65,26),(80,32),(100,40)],
+    3.000: [(1,1),(2,2),(6,2),(9,3),(15,5),(21,7),(27,9),(36,12),(45,15),(54,18),(72,24),(90,30),(111,37)],
+}
+
+def get_ratio(image_path):
+    """
+    Return the ratio h_div_w for the given image.
+    """
+    with Image.open(image_path) as im:
+        w, h = im.size
+    # Avoid division by zero
+    if w == 0:
+        return 0.0
+    # get ratio from ratio2hws, if error > 0.1, set to 0 for indicating skip
+    ratio = h / w
+    nearest_ratio = min(ratio2hws.keys(), key=lambda x: abs(x - ratio))
+    if abs(nearest_ratio - ratio) > 0.1:
+        return 0.0
+    return nearest_ratio
+
+def process_subfolder(subfolder_path):
+    """
+    Pass 1:
+    Process a single subfolder to create partial results.
+    1) Check .finished -> skip if exists
+    2) Find .webp & matching .txt
+    3) Compute ratio, read caption
+    4) Write partial results to a local file (partial.json)
+    5) Mark subfolder as .finished
+    """
+    finished_marker = os.path.join(subfolder_path, MARKER)
+    if os.path.exists(finished_marker):
+        # Already processed
+        return
+
+    items = []
+
+    # Gather all .webp files in the subfolder
+    webp_files = glob.glob(os.path.join(subfolder_path, "*.webp"))
+    for webp_path in webp_files:
+        base_name = os.path.splitext(os.path.basename(webp_path))[0]
+        txt_path = os.path.join(subfolder_path, base_name + ".txt")
+
+        if not os.path.exists(txt_path):
+            # No matching text file -> skip
+            continue
+
+        # Compute ratio
+        ratio = get_ratio(webp_path)
+        if ratio == 0.0:
+            # Skip this image
+            continue
+
+        # Read long caption from .txt
+        with open(txt_path, "r", encoding="utf-8") as f:
+            long_caption = f.read().strip()
+
+        # Create item (skip short_caption_type & text)
+        item = {
+            "image_path": webp_path,
+            "h_div_w": ratio,
+            "long_caption": long_caption,
+            "long_caption_type": "tag"
+        }
+        items.append(item)
+
+    # Write partial results to local file
+    partial_path = os.path.join(subfolder_path, "partial.json")
+    with open(partial_path, "w", encoding="utf-8") as pf:
+        json.dump(items, pf, ensure_ascii=False)
+
+    # Mark subfolder as finished
+    with open(finished_marker, "w", encoding="utf-8") as fm:
+        fm.write("done\n")
+
+def gather_partial_results():
+    """
+    Pass 2:
+    Read partial.json from all subfolders and create final JSONL outputs.
+    """
+    # 1) Collect all partial.json files
+    partial_files = []
+    for folder in os.listdir(IMAGES_DIR):
+        subfolder_path = os.path.join(IMAGES_DIR, folder)
+        if not os.path.isdir(subfolder_path):
+            continue
+
+        partial_path = os.path.join(subfolder_path, "partial.json")
+        if os.path.exists(partial_path):
+            partial_files.append(partial_path)
+
+    ratio_map = {}
+
+    # 2) Read each partial.json and group items by ratio
+    for pf in tqdm(partial_files, desc="Gathering partial results"):
+        with open(pf, "r", encoding="utf-8") as f:
+            items = json.load(f)
+        for it in items:
+            ratio_rounded = round(it["h_div_w"], 3)
+            if ratio_rounded not in ratio_map:
+                ratio_map[ratio_rounded] = []
+            ratio_map[ratio_rounded].append(it)
+
+    # 3) Write each ratio group to final JSONL in OUTPUT_DIR
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+    for ratio, items in ratio_map.items():
+        # Example filename: 1.000_000002500.jsonl
+        ratio_str = f"{ratio:.3f}"
+        count = len(items)
+        filename = f"{ratio_str}_{count:09d}.jsonl"
+        output_path = os.path.join(OUTPUT_DIR, filename)
+
+        # Write to JSONL (append or overwrite as needed)
+        # Usually you'd just overwrite once, but if you want
+        # to do multiple passes, you could open with 'a' (append).
+        with open(output_path, "w", encoding="utf-8") as of:
+            for record in items:
+                # Keep only required fields
+                out_record = {
+                    "image_path": record["image_path"],
+                    "h_div_w": record["h_div_w"],
+                    "long_caption": record["long_caption"],
+                    "long_caption_type": record["long_caption_type"]
+                }
+                of.write(json.dumps(out_record, ensure_ascii=False) + "\n")
+
+    print(f"Done! Created {len(ratio_map)} grouped JSONL files in {OUTPUT_DIR}")
+
+def main():
+    # --- PASS 1 ---
+    # Process each subfolder in parallel to create partial results
+    subfolders = [
+        os.path.join(IMAGES_DIR, d)
+        for d in os.listdir(IMAGES_DIR)
+        if os.path.isdir(os.path.join(IMAGES_DIR, d))
+    ]
+    subfolders.sort()
+
+    cpu_count = min(24, max(1, multiprocessing.cpu_count() - 1))
+    pool = multiprocessing.Pool(cpu_count)
+
+    # Multiprocessing with a progress bar
+    with tqdm(total=len(subfolders), desc="Processing subfolders") as pbar:
+        for _ in pool.imap_unordered(process_subfolder, subfolders):
+            pbar.update(1)
+
+    pool.close()
+    pool.join()
+
+    # --- PASS 2 ---
+    # Now gather partial.json from each subfolder to produce final JSONLs
+    gather_partial_results()
+
+
+if __name__ == "__main__":
+    main()
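
[note on PATCH 2/2] The split/*.jsonl files written above are what dataset_t2i_iterable.py consumes: prepare_model_input reads image_path, h_div_w, long_caption and long_caption_type (plus an optional text field) from each record, and set_h_div_w_template2generator parses the sample count from the <ratio>_<count>.jsonl filename. A minimal sanity-check sketch over the generated directory (the check_split_files helper and its report format are illustrative, not part of either patch):

    import glob
    import json
    import os

    SPLIT_DIR = "./split"  # same location as OUTPUT_DIR in split-builder.py
    REQUIRED_KEYS = ("image_path", "h_div_w", "long_caption", "long_caption_type")

    def check_split_files(split_dir=SPLIT_DIR):
        """Verify filename counts and required fields in every generated JSONL file."""
        for path in sorted(glob.glob(os.path.join(split_dir, "*.jsonl"))):
            name = os.path.splitext(os.path.basename(path))[0]
            ratio_str, count_str = name.split("_")
            n_records, n_bad = 0, 0
            with open(path, "r", encoding="utf-8") as f:
                for line in f:
                    record = json.loads(line)
                    n_records += 1
                    if any(key not in record for key in REQUIRED_KEYS):
                        n_bad += 1
            status = "OK" if (n_records == int(count_str) and n_bad == 0) else "MISMATCH"
            print(f"{name}: ratio={ratio_str}, records={n_records}, missing_fields={n_bad} -> {status}")

    if __name__ == "__main__":
        check_split_files()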