diff --git a/.gitignore b/.gitignore index 95d4d1e..aa49cf9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ #custom -synth_pipeline/ + # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/midst_models/multi_table_ClavaDDPM/slurm_scripts/clavaddpm.sh b/midst_models/multi_table_ClavaDDPM/slurm_scripts/clavaddpm.sh new file mode 100755 index 0000000..bc649b1 --- /dev/null +++ b/midst_models/multi_table_ClavaDDPM/slurm_scripts/clavaddpm.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +main_dataset_dir="/projects/aieng/midst_competition/data/clavaddpm_scratch" +log_dir="/projects/aieng/midst_competition/logs/clavaddpm_scratch" + +datasets=($(ls -d $main_dataset_dir/*/ | sort)) + +num_gpus=7 +batch_size=7 +num_datasets=${#datasets[@]} + +for (( i=0; i<$num_datasets; i+=batch_size )); do + batch=("${datasets[@]:i:batch_size}") + job_ids=() + for dataset in "${batch[@]}"; do + dataset_name=$(basename $dataset) + output_log="${log_dir}/${dataset_name}_%j.log" + echo "Submitting job for dataset: $dataset_name" + echo "Dataset path: $dataset" + job_id=$(sbatch --job-name="${dataset_name}" \ + --output=$output_log \ + --export=DATASET_DIR=$dataset \ + clavaddpm.slrm &) + job_ids+=($job_id) + if [[ $(jobs -r -p | wc -l) -ge $num_gpus ]]; then + wait -n + fi + done + echo "Sleeping for 6 hours before submitting the next batch..." + sleep 21600 + + # echo "Waiting for batch $((i/batch_size + 1)) to complete..." + # for job_id in "${job_ids[@]}"; do + # while squeue -j $job_id > /dev/null 2>&1; do + # echo "Job $job_id is still running. Checking again in 60 seconds." + # sleep 300 + # done + # done + echo "Batch $((i/batch_size + 1)) complete." 
+done diff --git a/midst_models/multi_table_ClavaDDPM/slurm_scripts/clavaddpm.slrm b/midst_models/multi_table_ClavaDDPM/slurm_scripts/clavaddpm.slrm new file mode 100644 index 0000000..27e3c57 --- /dev/null +++ b/midst_models/multi_table_ClavaDDPM/slurm_scripts/clavaddpm.slrm @@ -0,0 +1,17 @@ +#!/bin/bash +#SBATCH -p rtx6000 +#SBATCH --job-name=train +#SBATCH --output=output_%j.log +#SBATCH --gres=gpu:1 +#SBATCH --qos=m +#SBATCH -c 4 +#SBATCH --mem=30G +#SBATCH --ntasks=1 +#SBATCH --time=12:00:00 + +export HYDRA_FULL_ERROR=1 +source /scratch/ssd004/scratch/malinoori/poetry/midst-competition-rGytac8L-py3.9/bin/activate +cd /fs01/home/malinoori/MIDSTModels/midst_models/multi_table_ClavaDDPM/synth_pipeline +python train.py data_dir="$DATASET_DIR" +python sample.py data_dir="$DATASET_DIR" +# python evaluate_all.py \ No newline at end of file diff --git a/midst_models/multi_table_ClavaDDPM/synth_pipeline/__init__.py b/midst_models/multi_table_ClavaDDPM/synth_pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/evaluate.yaml b/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/evaluate.yaml new file mode 100644 index 0000000..4a8d73d --- /dev/null +++ b/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/evaluate.yaml @@ -0,0 +1,13 @@ +defaults: + - _self_ + +data_dir: /projects/aieng/midst_competition/data/tabddpm/tabddpm_1 +main_data_path: /projects/aieng/midst_competition/data/tabddpm/trans.csv +data_name: "trans" +exp_name: "train_1" +eval_flags: + eval_density: True + eval_quality: True + eval_dcr: True + eval_detection: True + eval_mle: True diff --git a/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/evaluate_all.yaml b/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/evaluate_all.yaml new file mode 100644 index 0000000..441849d --- /dev/null +++ b/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/evaluate_all.yaml @@ -0,0 +1,13 @@ +defaults: + - 
_self_ + +base_dir: /projects/aieng/midst_competition/data/clavaddpm_round2 +main_data_path: /projects/aieng/midst_competition/data/tabddpm/trans.csv +data_name: "trans" +exp_name: "train_1" +eval_flags: + eval_density: True + eval_quality: True + eval_dcr: True + eval_detection: True + eval_mle: True diff --git a/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/generate_subsets.yaml b/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/generate_subsets.yaml new file mode 100644 index 0000000..244ceb1 --- /dev/null +++ b/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/generate_subsets.yaml @@ -0,0 +1,10 @@ +main_data_dir: /projects/aieng/midst_competition/data/berka/clavaddpm # Path to the main data directory where the whole dataset is stored along with config templates +save_data_dir: /projects/aieng/midst_competition/data/clavaddpm_wb # Path to the save data directory where the subsets will be saved +info_dir: /projects/aieng/midst_competition/data/clavaddpm/clavaddpm_1/eval_files # Path to the info directory where the info.json file is stored +data_name: trans # Name of the dataset +files_to_exclude: ['info.json', 'info_regression.json', 'trans.json'] + +num_subsets: 20 # Number of subsets to create +subset_size: 20000 # Number of rows in each subset +test_size: 10000 # Number of rows in the test set +challenge_size: 200 # Number of rows in the challenge set \ No newline at end of file diff --git a/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/sample.yaml b/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/sample.yaml new file mode 100644 index 0000000..e5cd2c5 --- /dev/null +++ b/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/sample.yaml @@ -0,0 +1,6 @@ +defaults: + - _self_ + +data_dir: /projects/aieng/midst_competition/data/clavaddpm/clavaddpm_1 +data_name: "trans" +exp_name: "train_1" \ No newline at end of file diff --git a/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/train.yaml 
b/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/train.yaml new file mode 100644 index 0000000..95c389c --- /dev/null +++ b/midst_models/multi_table_ClavaDDPM/synth_pipeline/config/train.yaml @@ -0,0 +1,7 @@ +defaults: + - _self_ + +data_dir: /projects/aieng/midst_competition/data/clavaddpm_scratch/clavaddpm_1 +data_name: "trans" +exp_name: "train_1" +# exp_name: "update" \ No newline at end of file diff --git a/midst_models/multi_table_ClavaDDPM/synth_pipeline/evaluate.py b/midst_models/multi_table_ClavaDDPM/synth_pipeline/evaluate.py new file mode 100644 index 0000000..39a7312 --- /dev/null +++ b/midst_models/multi_table_ClavaDDPM/synth_pipeline/evaluate.py @@ -0,0 +1,99 @@ +import os +import json +import torch +import hydra +import logging +from omegaconf import DictConfig +from pprint import pprint + +from midst_models.single_table_TabSyn.scripts.eval.eval_density import eval_density +from midst_models.single_table_TabSyn.scripts.eval.eval_quality import eval_quality +from midst_models.single_table_TabSyn.scripts.eval.eval_mle import eval_mle +from midst_models.single_table_TabSyn.scripts.eval.eval_dcr import eval_dcr +from midst_models.single_table_TabSyn.scripts.eval.eval_detection import eval_detection + +logger = logging.getLogger(__name__) + +@hydra.main(config_path="config", config_name="evaluate", version_base="1.1") +def main(cfg: DictConfig): + data_dir = cfg.data_dir + exp_name = cfg.exp_name + data_name = cfg.data_name + workspace_dir = os.path.join(data_dir, "workspace", exp_name) + + main_data_path = cfg.main_data_path + train_data_path = os.path.join(data_dir, "train.csv") + test_data_path = os.path.join(data_dir, "test.csv") + synth_data_path = os.path.join(workspace_dir, data_name, "_final", f"{data_name}_synthetic.csv") + + + info_path = os.path.join(data_dir, "info.json") + + eval_results = {} + + # 1. 
Density Evaluation + if cfg.eval_flags.eval_density: + logger.info("Running density evaluation for train data...") + shape, trend = eval_density(synth_data_path, train_data_path, info_path) + eval_results['density_shape_train'] = shape + eval_results['density_trend_train'] = trend + logger.info(f"Shape: {shape}, Trend: {trend}") + + logger.info("Running density evaluation for all data...") + shape, trend = eval_density(synth_data_path, main_data_path, info_path) + eval_results['density_shape_all'] = shape + eval_results['density_trend_all'] = trend + logger.info(f"Shape: {shape}, Trend: {trend}") + + # 2. Quality Evaluation + if cfg.eval_flags.eval_quality: + logger.info("Running quality evaluation...") + alpha_precision, beta_recall = eval_quality(synth_data_path, train_data_path, info_path) + eval_results['alpha_precision'] = alpha_precision + eval_results['beta_recall'] = beta_recall + logger.info(f"Alpha Precision: {alpha_precision}, Beta Recall: {beta_recall}") + + # 3. DCR (Distance to Closest Record) Evaluation + if cfg.eval_flags.eval_dcr: + logger.info("Running DCR evaluation...") + with open(info_path, "r") as file: + data_info = json.load(file) + ideal_dcr = data_info["train_num"] / (data_info["train_num"] + data_info["test_num"]) + + dcr_score = eval_dcr(synth_data_path, train_data_path, test_data_path, info_path) + eval_results['dcr_score'] = dcr_score + eval_results['ideal_dcr'] = ideal_dcr + logger.info(f"DCR Score: {dcr_score}, Ideal DCR: {ideal_dcr}") + + # 4. 
Detection Evaluation + if cfg.eval_flags.eval_detection: + logger.info("Running detection evaluation for train data...") + detection_score = eval_detection(synth_data_path, train_data_path, info_path, data_name, model="tabddpm") + eval_results['detection_score_train'] = detection_score + logger.info(f"Detection Score: {detection_score}") + + logger.info("Running detection evaluation for all data...") + detection_score = eval_detection(synth_data_path, main_data_path, info_path, data_name, model="tabddpm") + eval_results['detection_score_all'] = detection_score + logger.info(f"Detection Score: {detection_score}") + + # 5. MLE Evaluation + if cfg.eval_flags.eval_mle: + logger.info("Running MLE evaluation...") + mle_synth_score = eval_mle(synth_data_path, test_data_path, info_path) + mle_baseline_score = eval_mle(train_data_path, test_data_path, info_path) + eval_results['mle_synth_score'] = mle_synth_score + eval_results['mle_baseline_score'] = mle_baseline_score + logger.info("MLE Scores:") + logger.info(f"Synthetic: {mle_synth_score}, Baseline: {mle_baseline_score}") + + # Save evaluation results to a file + eval_json_path = os.path.join(workspace_dir, f"{data_name}_eval.json") + with open(eval_json_path, 'w') as eval_file: + json.dump(eval_results, eval_file, indent=4) + + logger.info(f"Evaluation results saved to {eval_json_path}") + + +if __name__ == "__main__": + main() diff --git a/midst_models/multi_table_ClavaDDPM/synth_pipeline/evaluate_all.py b/midst_models/multi_table_ClavaDDPM/synth_pipeline/evaluate_all.py new file mode 100644 index 0000000..f19f757 --- /dev/null +++ b/midst_models/multi_table_ClavaDDPM/synth_pipeline/evaluate_all.py @@ -0,0 +1,160 @@ +import os +import json +import torch +import hydra +import logging +import pandas as pd +from omegaconf import DictConfig +from pathlib import Path + +from midst_models.single_table_TabSyn.scripts.eval.eval_density import eval_density +from midst_models.single_table_TabSyn.scripts.eval.eval_quality import 
eval_quality +from midst_models.single_table_TabSyn.scripts.eval.eval_mle import eval_mle +from midst_models.single_table_TabSyn.scripts.eval.eval_dcr import eval_dcr +from midst_models.single_table_TabSyn.scripts.eval.eval_detection import eval_detection + +from midst_models.multi_table_ClavaDDPM.complex_pipeline import ( + clava_clustering, + clava_load_pretrained, + clava_load_synthesized_data, + clava_eval, + load_configs +) +from midst_models.multi_table_ClavaDDPM.pipeline_utils import load_multi_table +from midst_models.multi_table_ClavaDDPM.report_utils import get_multi_metadata + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') +logger = logging.getLogger(__name__) + +# List of tables to process +TABLES = ['account', 'card', 'client', 'disp', 'district', 'loan', 'order', 'trans'] + +@hydra.main(config_path="config", config_name="evaluate_all", version_base="1.1") +def main(cfg: DictConfig): + base_dir = Path(cfg.base_dir) # Parent directory containing clavaddpm_1 to clavaddpm_70 + + # Iterate through all subdirectories clavaddpm_1 to clavaddpm_70 + for i in range(92, 111): + subdir_name = f"clavaddpm_{i}" + subdir_path = base_dir / subdir_name + exp_dir = subdir_path / "workspace" / cfg.exp_name + + if not subdir_path.exists() or not subdir_path.is_dir(): + raise FileNotFoundError(f"Subdirectory {subdir_path} does not exist or is not a directory. 
Skipping.") + + logger.info(f"Processing subdirectory: {subdir_path}") + + eval_dir = subdir_path / "eval_files" + + eval_results = {} + + for table in TABLES: + # Paths + synth_data_with_id = subdir_path / "workspace" / cfg.exp_name / table / "_final" / f"{table}_synthetic.csv" + synth_df = pd.read_csv(synth_data_with_id) + synth_df_no_id = synth_df.drop(columns=[col for col in synth_df.columns if col.endswith("_id")]) + + info_json_path = subdir_path / "eval_files" / f"{table}_info.json" + with open(info_json_path, 'r') as f: + info_data = json.load(f) + + cat_col_idx = info_data.get("cat_col_idx", []) + target_col_idx = info_data.get("target_col_idx", []) + task_type = info_data.get("task_type", "") + + if task_type != "regression": + cat_col_idx += target_col_idx + + for idx in cat_col_idx: + col_name = synth_df_no_id.columns[idx] + synth_df_no_id[col_name] = synth_df_no_id[col_name].apply( + lambda x: int(float(x)) if str(x).replace('.0', '').isdigit() else x + ) + synth_data_no_id = subdir_path / "workspace" / cfg.exp_name / table / "_final" / f"{table}_synthetic_no_id.csv" + synth_df_no_id.to_csv(synth_data_no_id, index=False) + + info_path = eval_dir / f"{table}_info.json" + train_data_path = eval_dir / f"{table}_no_id.csv" + synth_data_path = subdir_path / "workspace" / cfg.exp_name / table / "_final" / f"{table}_synthetic_no_id.csv" + + if not info_path.exists(): + raise FileNotFoundError(f"{info_path} does not exist. 
Skipping updating data_path.") + + with open(info_path, 'r') as f: + info_data = json.load(f) + logger.info(f"Loaded {info_path}") + + if cfg.eval_flags.eval_density: + + shape, trend = eval_density(str(synth_data_path), str(train_data_path), str(info_path)) + eval_results[f'density_shape_train_{table}'] = shape + eval_results[f'density_trend_train_{table}'] = trend + logger.info(f"Density Evaluation for {table}: Shape={shape}, Trend={trend}") + + + # Other Evaluations only for 'trans' table + if table == 'trans': + test_data_path = eval_dir / "test_no_id.csv" # Assuming test_no_id.csv is in eval_files + main_data_path = cfg.main_data_path + + if cfg.eval_flags.eval_density: + shape, trend = eval_density(synth_data_path, main_data_path, info_path) + eval_results['density_shape_all'] = shape + eval_results['density_trend_all'] = trend + logger.info(f"Shape: {shape}, Trend: {trend}") + + if cfg.eval_flags.eval_quality: + alpha_precision, beta_recall = eval_quality(str(synth_data_path), str(train_data_path), str(info_path)) + eval_results['alpha_precision_trans'] = alpha_precision + eval_results['beta_recall_trans'] = beta_recall + logger.info(f"Quality Evaluation for trans: Alpha Precision={alpha_precision}, Beta Recall={beta_recall}") + + if cfg.eval_flags.eval_dcr: + ideal_dcr = info_data["train_num"] / (info_data["train_num"] + info_data["test_num"]) + dcr_score = eval_dcr(str(synth_data_path), str(train_data_path), str(test_data_path), str(info_path)) + eval_results['dcr_score_trans'] = dcr_score + eval_results['ideal_dcr_trans'] = ideal_dcr + logger.info(f"DCR Evaluation for trans: DCR Score={dcr_score}, Ideal DCR={ideal_dcr}") + + if cfg.eval_flags.eval_detection: + detection_score = eval_detection(str(synth_data_path), str(train_data_path), str(info_path), cfg.data_name, model="clavaddpm") + eval_results['detection_score_train_trans'] = detection_score + logger.info(f"Detection Evaluation for trans (train): Detection Score={detection_score}") + + 
detection_score_all = eval_detection(str(synth_data_path), str(main_data_path), str(info_path), cfg.data_name, model="clavaddpm") + eval_results['detection_score_all_trans'] = detection_score_all + logger.info(f"Detection Evaluation for trans (all): Detection Score={detection_score_all}") + + + if cfg.eval_flags.eval_mle: + mle_synth_score = eval_mle(str(synth_data_path), str(test_data_path), str(info_path)) + mle_baseline_score = eval_mle(str(train_data_path), str(test_data_path), str(info_path)) + eval_results['mle_synth_score_trans'] = mle_synth_score + eval_results['mle_baseline_score_trans'] = mle_baseline_score + logger.info(f"MLE Evaluation for trans: Synthetic Score={mle_synth_score}, Baseline Score={mle_baseline_score}") + + tables, relation_order, dataset_meta = load_multi_table(subdir_path) + cleaned_tables = clava_load_synthesized_data(tables.keys(), exp_dir) + config_path = subdir_path / f"{cfg.data_name}.json" + configs, _ = load_configs(config_path) + + report = clava_eval(tables, subdir_path, configs, relation_order, cleaned_tables) + for key, val in report.items(): + if key in ['avg_scores', 'all_avg_score']: + if key == 'avg_scores': + for k, v in val.items(): + logger.info("{:}-hop column correlation: {}".format(k, v)) + eval_results[f"{k}_hop_column_correlation"] = v + elif key == 'all_avg_score': + logger.info("{:}: {}".format(key, val)) + eval_results[key] = val + + # Save all evaluation results to eval.json in the subdir + eval_json_path = subdir_path / "eval.json" + with open(eval_json_path, 'w') as eval_file: + json.dump(eval_results, eval_file, indent=4) + + +if __name__ == "__main__": + main() diff --git a/midst_models/multi_table_ClavaDDPM/synth_pipeline/generate_subsets.py b/midst_models/multi_table_ClavaDDPM/synth_pipeline/generate_subsets.py new file mode 100644 index 0000000..28b7ac8 --- /dev/null +++ b/midst_models/multi_table_ClavaDDPM/synth_pipeline/generate_subsets.py @@ -0,0 +1,229 @@ +import glob +import os +import json +import 
pandas as pd +import numpy as np +import hydra +import logging +import shutil +from pathlib import Path +from omegaconf import DictConfig +from sklearn.model_selection import train_test_split + +logger = logging.getLogger(__name__) + +@hydra.main(config_path="config", config_name="generate_subsets", version_base="1.1") +def generate_subsets(cfg: DictConfig): + + num_subsets = cfg.num_subsets + subset_size = cfg.subset_size + test_size = cfg.test_size + challenge_size = cfg.challenge_size // 2 + + os.makedirs(cfg.save_data_dir, exist_ok=True) + + account_df = pd.read_csv(os.path.join(cfg.main_data_dir, 'account.csv')) + card_df = pd.read_csv(os.path.join(cfg.main_data_dir, 'card.csv')) + client_df = pd.read_csv(os.path.join(cfg.main_data_dir, 'client.csv')) + disp_df = pd.read_csv(os.path.join(cfg.main_data_dir, 'disp.csv')) + district_df = pd.read_csv(os.path.join(cfg.main_data_dir, 'district.csv')) + loan_df = pd.read_csv(os.path.join(cfg.main_data_dir, 'loan.csv')) + order_df = pd.read_csv(os.path.join(cfg.main_data_dir, 'order.csv')) + trans_df = pd.read_csv(os.path.join(cfg.main_data_dir, 'trans.csv')) + + main_clean = trans_df.drop(columns=[col for col in trans_df.columns if col.endswith('_id')]) + main_clean_file = os.path.join(cfg.save_data_dir, f'{cfg.data_name}.csv') + main_clean.to_csv(main_clean_file, index=False) + logger.info(f"Main dataset (without _id columns) saved at {main_clean_file}") + + id_columns = ['account_id', 'client_id', 'disp_id', 'trans_id', 'card_id', 'loan_id', 'order_id', 'district_id'] + for df in [account_df, card_df, client_df, disp_df, loan_df, order_df, trans_df]: + for col in id_columns: + if col in df.columns: + df[col] = df[col].astype(int) + + # Begin processing for each subset + for subset_num in range(71, num_subsets + 71): + print(f'Processing subset {subset_num}...') + + # Create a subdirectory for this subset + subset_dir = os.path.join(cfg.save_data_dir, f'clavaddpm_{subset_num}') + os.makedirs(subset_dir, 
exist_ok=True) + + eval_dir = os.path.join(subset_dir, 'eval_files') + os.makedirs(eval_dir, exist_ok=True) + + seed = subset_num * 100 + 36 + rng = np.random.RandomState(seed=seed) + + # Sample uniformly 20k from trans data + sampled_trans_df = trans_df.sample(n=subset_size, random_state=rng) + + # Extract corresponding account records using account_id + sampled_account_ids = sampled_trans_df['account_id'].unique() + sampled_account_df = account_df[account_df['account_id'].isin(sampled_account_ids)] + + # Extract corresponding district records from account + sampled_district_ids = sampled_account_df['district_id'].unique() + sampled_district_df = district_df[district_df['district_id'].isin(sampled_district_ids)] + + # Extract corresponding order, loan, disposition + sampled_disp_df = disp_df[disp_df['account_id'].isin(sampled_account_ids)] + sampled_order_df = order_df[order_df['account_id'].isin(sampled_account_ids)] + sampled_loan_df = loan_df[loan_df['account_id'].isin(sampled_account_ids)] + + # From disposition extract corresponding card + sampled_card_ids = sampled_disp_df['disp_id'].unique() + sampled_card_df = card_df[card_df['disp_id'].isin(sampled_card_ids)] + + # From disposition extract corresponding client + sampled_client_ids = sampled_disp_df['client_id'].unique() + sampled_client_df = client_df[client_df['client_id'].isin(sampled_client_ids)] + + # From client extract corresponding district and add any new districts + client_district_ids = sampled_client_df['district_id'].unique() + additional_district_ids = np.setdiff1d(client_district_ids, sampled_district_ids) + if additional_district_ids.size > 0: + additional_district_df = district_df[district_df['district_id'].isin(additional_district_ids)] + sampled_district_df = pd.concat([sampled_district_df, additional_district_df]).drop_duplicates() + + # Save these files in their own subdir + sampled_trans_df.to_csv(os.path.join(subset_dir, 'trans.csv'), index=False) + 
sampled_account_df.to_csv(os.path.join(subset_dir, 'account.csv'), index=False) + sampled_district_df.to_csv(os.path.join(subset_dir, 'district.csv'), index=False) + sampled_order_df.to_csv(os.path.join(subset_dir, 'order.csv'), index=False) + sampled_loan_df.to_csv(os.path.join(subset_dir, 'loan.csv'), index=False) + sampled_disp_df.to_csv(os.path.join(subset_dir, 'disp.csv'), index=False) + sampled_card_df.to_csv(os.path.join(subset_dir, 'card.csv'), index=False) + sampled_client_df.to_csv(os.path.join(subset_dir, 'client.csv'), index=False) + + # save files without id + trans_df_no_id = sampled_trans_df.drop(columns=[col for col in sampled_trans_df.columns if col.endswith('_id')]) + trans_df_no_id.to_csv(os.path.join(eval_dir, f'trans_no_id.csv'), index=False) + account_df_no_id = sampled_account_df.drop(columns=[col for col in sampled_account_df.columns if col.endswith('_id')]) + account_df_no_id.to_csv(os.path.join(eval_dir, f'account_no_id.csv'), index=False) + district_df_no_id = sampled_district_df.drop(columns=[col for col in sampled_district_df.columns if col.endswith('_id')]) + district_df_no_id.to_csv(os.path.join(eval_dir, f'district_no_id.csv'), index=False) + order_df_no_id = sampled_order_df.drop(columns=[col for col in sampled_order_df.columns if col.endswith('_id')]) + order_df_no_id.to_csv(os.path.join(eval_dir, f'order_no_id.csv'), index=False) + loan_df_no_id = sampled_loan_df.drop(columns=[col for col in sampled_loan_df.columns if col.endswith('_id')]) + loan_df_no_id.to_csv(os.path.join(eval_dir, f'loan_no_id.csv'), index=False) + disp_df_no_id = sampled_disp_df.drop(columns=[col for col in sampled_disp_df.columns if col.endswith('_id')]) + disp_df_no_id.to_csv(os.path.join(eval_dir, f'disp_no_id.csv'), index=False) + card_df_no_id = sampled_card_df.drop(columns=[col for col in sampled_card_df.columns if col.endswith('_id')]) + card_df_no_id.to_csv(os.path.join(eval_dir, f'card_no_id.csv'), index=False) + client_df_no_id = 
sampled_client_df.drop(columns=[col for col in sampled_client_df.columns if col.endswith('_id')]) + client_df_no_id.to_csv(os.path.join(eval_dir, f'client_no_id.csv'), index=False) + + # create the challenge set + rng = np.random.RandomState(seed=seed + 37) + challenge_in_subset = sampled_trans_df.sample(n=challenge_size, random_state=rng) + + rng = np.random.RandomState(seed=seed + 38) + challenge_out_subset = trans_df[~trans_df.index.isin(sampled_trans_df.index)].sample( + n=challenge_size, random_state=rng + ) + challenge_set = pd.concat([challenge_in_subset, challenge_out_subset]) + + # create the test set + combined_challenge_indices = challenge_set.index + rng = np.random.RandomState(seed=seed + 39) + test_set = trans_df[~trans_df.index.isin(sampled_trans_df.index) & ~trans_df.index.isin(combined_challenge_indices)].sample( + n=test_size, random_state=rng + ) + test_with_id_file = os.path.join(subset_dir, f'test.csv') + test_set.to_csv(test_with_id_file, index=False) + + # shuffle the challenge set + challenge_set = challenge_set.sample(frac=1, random_state=subset_num).reset_index(drop=True) + challenge_with_id_file = os.path.join(subset_dir, f'challenge.csv') + challenge_set.to_csv(challenge_with_id_file, index=False) + + # create the challenge label file + labels = [] + for idx, row in challenge_set.iterrows(): + if (sampled_trans_df == row).all(axis=1).any(): + labels.append(1) + else: + labels.append(0) + + challenge_label_df = pd.DataFrame({'is_train': labels}) + challenge_label_file = os.path.join(subset_dir, f'challenge_label.csv') + challenge_label_df.to_csv(challenge_label_file, index=False) + + # drop the _id columns and save the files + challenge_clean = challenge_set.drop(columns=[col for col in challenge_set.columns if col.endswith('_id')]) + challenge_file = os.path.join(subset_dir, f'challenge_no_id.csv') + challenge_clean.to_csv(challenge_file, index=False) + + test_clean = test_set.drop(columns=[col for col in test_set.columns if 
col.endswith('_id')]) + test_file = os.path.join(eval_dir, f'test_no_id.csv') + test_clean.to_csv(test_file, index=False) + + # save the data json and info json files + info_files = Path(cfg.info_dir).glob("*_info.json") + for info_file in info_files: + dest_info = os.path.join(eval_dir, os.path.basename(info_file)) + shutil.copy(str(info_file), str(dest_info)) + logging.info(f"Copied {info_file} to {dest_info}") + + table_name = info_file.stem.replace('_info', '') + csv_file = Path(eval_dir) / f"{table_name}_no_id.csv" + + df = pd.read_csv(csv_file) + train_size = len(df) + + with open(dest_info, 'r') as f: + info_data = json.load(f) + + # Update fields + info_data['data_path'] = str(csv_file.resolve()) + info_data['train_num'] = train_size + + if table_name == "trans": + info_data['test_num'] = len(test_clean) + else: + info_data['test_num'] = 0 + + with open(dest_info, 'w') as f: + json.dump(info_data, f, indent=4) + + data_json_path = os.path.join(cfg.main_data_dir, f'{cfg.data_name}.json') + with open(data_json_path, 'r') as f: + data_config = json.load(f) + data_config['general']['data_dir'] = subset_dir + data_config['general']['exp_name'] = f'train_1' + data_config['general']['workspace_dir'] = os.path.join(subset_dir, 'workspace') + data_config['general']['test_data_dir'] = subset_dir + + data_json_save_path = os.path.join(subset_dir, f'{cfg.data_name}.json') + with open(data_json_save_path, 'w') as f: + json.dump(data_config, f, indent=4) + + # copy the files in files_to_copy to the subset directory + file_patterns = ['*.pkl', '*.txt', '*.json'] + for pattern in file_patterns: + # Use glob to find all files matching the pattern in the source directory recursively + pattern_path = os.path.join(cfg.main_data_dir, '**', pattern) + matching_files = glob.glob(pattern_path, recursive=True) + + print(f"Copying files matching {pattern}: {len(matching_files)} files found.") + + for file_path in matching_files: + file_name = os.path.basename(file_path) + if 
file_name in cfg.files_to_exclude: + print(f"Skipping excluded file: {file_name}") + continue + # Copy each file to the target directory, preserving the directory structure + target_file_path = os.path.join(subset_dir, file_name) + if os.path.exists(target_file_path): + print(f"File {target_file_path} already exists. Skipping to prevent overwrite.") + continue + shutil.copy2(file_path, target_file_path) + logger.info(f"Copied {file_path} to {target_file_path}") + + logger.info(f"Subset {subset_num} generated successfully in {subset_dir}") + + +if __name__ == "__main__": + generate_subsets() \ No newline at end of file diff --git a/midst_models/multi_table_ClavaDDPM/synth_pipeline/sample.py b/midst_models/multi_table_ClavaDDPM/synth_pipeline/sample.py new file mode 100644 index 0000000..f1f0a26 --- /dev/null +++ b/midst_models/multi_table_ClavaDDPM/synth_pipeline/sample.py @@ -0,0 +1,86 @@ +import os +import logging +import hydra +import numpy as np +import pickle +from omegaconf import DictConfig +from midst_models.multi_table_ClavaDDPM.complex_pipeline import ( + clava_clustering, + clava_training, + clava_load_pretrained, + clava_synthesizing, + load_configs, +) +from midst_models.multi_table_ClavaDDPM.pipeline_modules import load_multi_table +from midst_models.multi_table_ClavaDDPM.pipeline_utils import get_df_without_id +logger = logging.getLogger(__name__) + +def save_table_info(tables, relation_order, models, save_dir): + table_info = {} + for parent, child in relation_order: + result = models[(parent, child)] + df_with_cluster = tables[child]['df'] + df_without_id = get_df_without_id(df_with_cluster) + df_info = result['df_info'] + X_num_real = df_without_id[df_info['num_cols']].to_numpy().astype(float) + uniq_vals_list = [] + for col in range(X_num_real.shape[1]): + uniq_vals = np.unique(X_num_real[:, col]) + uniq_vals_list.append(uniq_vals) + table_info[child] = { + 'uniq_vals_list': uniq_vals_list, + 'size': len(df_with_cluster), + 'columns': 
tables[child]['df'].columns, + 'parents': tables[child]['parents'], + 'original_cols': tables[child]['original_cols'] + } + required_keys = ['num_numerical_features', 'is_regression', 'inverse_transform', 'empirical_class_dist', 'K'] + filtered_result = {key: result[key] for key in required_keys} + table_info[child].update(filtered_result) + pickle.dump(table_info, open(os.path.join(save_dir, 'synthesize_table_info.pkl'), 'wb')) + +@hydra.main(config_path="config", config_name="sample", version_base="1.1") +def main(cfg: DictConfig): + data_dir = cfg.data_dir + exp_name = cfg.exp_name + data_name = cfg.data_name + + exp_dir = os.path.join(data_dir, "workspace", exp_name) + + config_path = os.path.join(exp_dir, f"{data_name}.json") + logger.info(f"Loading configuration from: {config_path}") + configs, _ = load_configs(config_path) + + tables, relation_order, dataset_meta = load_multi_table(data_dir) + + params_clustering = configs['clustering'] + logger.info(f"{'='*20} Clustering Parameters {'='*20}") + for key, val in params_clustering.items(): + logger.info(f"{key}: {val}") + + tables, all_group_lengths_prob_dicts = clava_clustering(tables, relation_order, exp_dir, configs) + + logger.info("Loading pretrained models from: %s", exp_dir) + models = clava_load_pretrained(relation_order, exp_dir) + # save_table_info(tables, relation_order, models, exp_dir) + # exit() + + params_sampling = configs['sampling'] + logger.info(f"{'='*20} Sampling Parameters {'='*20}") + for key, val in params_sampling.items(): + logger.info(f"{key}: {val}") + + sample_scale = 1 if 'debug' not in configs else configs['debug']['sample_scale'] + cleaned_tables, synthesizing_time_spent, matching_time_spent = clava_synthesizing( + # tables, + list(tables.keys()), + relation_order, + exp_dir, + all_group_lengths_prob_dicts, + models, + configs, + sample_scale=sample_scale + ) + +if __name__ == "__main__": + main() diff --git a/midst_models/multi_table_ClavaDDPM/synth_pipeline/train.py 
b/midst_models/multi_table_ClavaDDPM/synth_pipeline/train.py new file mode 100644 index 0000000..211b9d0 --- /dev/null +++ b/midst_models/multi_table_ClavaDDPM/synth_pipeline/train.py @@ -0,0 +1,59 @@ +import os +import json +import logging +import shutil +import uuid +import hydra +from omegaconf import DictConfig + +import sys +import os + +from midst_models.multi_table_ClavaDDPM.complex_pipeline import ( + clava_clustering, + clava_training, + clava_load_pretrained, + clava_synthesizing, + clava_load_synthesized_data, + clava_eval, + load_configs, +) +from midst_models.multi_table_ClavaDDPM.pipeline_modules import load_multi_table +from midst_models.multi_table_ClavaDDPM.gen_single_report import gen_single_report +from midst_models.multi_table_ClavaDDPM.pipeline_utils import load_multi_table + +logger = logging.getLogger(__name__) + +@hydra.main(config_path="config", config_name="train", version_base="1.1") +def main(cfg: DictConfig): + logger.info(f"running experiment: {cfg.exp_name}") + + data_dir = cfg.data_dir + logger.info(f"Data directory: {data_dir}") + + config_path = os.path.join(data_dir, f"{cfg.data_name}.json") + logger.info(f"Loading configuration from: {config_path}") + configs, _ = load_configs(config_path) + configs['general']['exp_name'] = cfg.exp_name + + save_dir = os.path.join(configs['general']['workspace_dir'], configs['general']['exp_name']) + os.makedirs(save_dir, exist_ok=True) + os.makedirs(os.path.join(save_dir, 'models'), exist_ok=True) + os.makedirs(os.path.join(save_dir, 'before_matching'), exist_ok=True) + + updated_config_path = os.path.join(save_dir, f"{cfg.data_name}.json") + with open(updated_config_path, 'w') as config_file: + json.dump(configs, config_file, indent=4) + logger.info(f"Exp configuration saved to: {updated_config_path}") + + tables, relation_order, dataset_meta = load_multi_table(data_dir) + + params_clustering = configs['clustering'] + tables, all_group_lengths_prob_dicts = clava_clustering(tables, 
relation_order, save_dir, configs) + + params_training = configs['diffusion'] + logger.info(f"Training parameters: {params_training}") + models = clava_training(tables, relation_order, save_dir, configs) + +if __name__ == "__main__": + main() diff --git a/midst_models/single_table_TabDDPM/slurm_scripts/tabddpm.sh b/midst_models/single_table_TabDDPM/slurm_scripts/tabddpm.sh new file mode 100755 index 0000000..6767d90 --- /dev/null +++ b/midst_models/single_table_TabDDPM/slurm_scripts/tabddpm.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +main_dataset_dir="/projects/aieng/midst_competition/data/tabddpm_wb" +log_dir="/projects/aieng/midst_competition/logs/tabddpm_wb" + +datasets=($(ls -d $main_dataset_dir/*/ | sort)) + +num_gpus=7 +batch_size=7 +num_datasets=${#datasets[@]} + +for (( i=0; i<$num_datasets; i+=batch_size )); do + batch=("${datasets[@]:i:batch_size}") + job_ids=() + for dataset in "${batch[@]}"; do + dataset_name=$(basename $dataset) + output_log="${log_dir}/${dataset_name}_%j.log" + echo "Submitting job for dataset: $dataset_name" + echo "Dataset path: $dataset" + job_id=$(sbatch --job-name="${dataset_name}" \ + --output=$output_log \ + --export=DATASET_DIR=$dataset \ + tabddpm.slrm &) + job_ids+=($job_id) + if [[ $(jobs -r -p | wc -l) -ge $num_gpus ]]; then + wait -n + fi + done + echo "Sleeping for 45 minutes before submitting the next batch..." + sleep 2700 + + # echo "Waiting for batch $((i/batch_size + 1)) to complete..." + # for job_id in "${job_ids[@]}"; do + # while squeue -j $job_id > /dev/null 2>&1; do + # echo "Job $job_id is still running. Checking again in 60 seconds." + # sleep 60 + # done + # done + echo "Batch $((i/batch_size + 1)) complete." 
+done diff --git a/midst_models/single_table_TabDDPM/slurm_scripts/tabddpm.slrm b/midst_models/single_table_TabDDPM/slurm_scripts/tabddpm.slrm new file mode 100644 index 0000000..f74c43a --- /dev/null +++ b/midst_models/single_table_TabDDPM/slurm_scripts/tabddpm.slrm @@ -0,0 +1,17 @@ +#!/bin/bash +#SBATCH -p rtx6000 +#SBATCH --job-name=train +#SBATCH --output=output_%j.log +#SBATCH --gres=gpu:1 +#SBATCH --qos=m +#SBATCH -c 4 +#SBATCH --mem=30G +#SBATCH --ntasks=1 +#SBATCH --time=12:00:00 + +export HYDRA_FULL_ERROR=1 +source /scratch/ssd004/scratch/malinoori/poetry/midst-competition-rGytac8L-py3.9/bin/activate +cd /fs01/home/malinoori/MIDSTModels/midst_models/single_table_TabDDPM/synth_pipeline +python train.py data_dir="$DATASET_DIR" +python sample.py data_dir="$DATASET_DIR" +python evaluate.py data_dir="$DATASET_DIR" \ No newline at end of file diff --git a/midst_models/single_table_TabDDPM/synth_pipeline/__init__.py b/midst_models/single_table_TabDDPM/synth_pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/midst_models/single_table_TabDDPM/synth_pipeline/config/evaluate.yaml b/midst_models/single_table_TabDDPM/synth_pipeline/config/evaluate.yaml new file mode 100644 index 0000000..85cb8ce --- /dev/null +++ b/midst_models/single_table_TabDDPM/synth_pipeline/config/evaluate.yaml @@ -0,0 +1,13 @@ +defaults: + - _self_ + +data_dir: /projects/aieng/midst_competition/data/tabddpm_round3/tabddpm_1 +main_data_path: /projects/aieng/midst_competition/data/tabddpm/trans.csv +data_name: "trans" +exp_name: "train_1" +eval_flags: + eval_density: True + eval_quality: True + eval_dcr: True + eval_detection: True + eval_mle: True diff --git a/midst_models/single_table_TabDDPM/synth_pipeline/config/generate_subsets.yaml b/midst_models/single_table_TabDDPM/synth_pipeline/config/generate_subsets.yaml new file mode 100644 index 0000000..84e5c2b --- /dev/null +++ b/midst_models/single_table_TabDDPM/synth_pipeline/config/generate_subsets.yaml @@ -0,0 +1,9 @@ 
+main_data_dir: /projects/aieng/midst_competition/data/berka/tabddpm # Path to the main data directory where the whole dataset is stored along with config templates +save_data_dir: /projects/aieng/midst_competition/data/tabddpm_round3 # Path to the save data directory where the subsets will be saved +data_name: trans # Name of the dataset +files_to_copy: ['dataset_meta.json', 'trans_domain.json', 'trans_earliest_date.txt', 'trans_label_encoders.pkl'] + +num_subsets: 40 #70 # Number of subsets to create +subset_size: 20000 # Number of rows in each subset +test_size: 10000 # Number of rows in the test set +challenge_size: 200 # Number of rows in the challenge set \ No newline at end of file diff --git a/midst_models/single_table_TabDDPM/synth_pipeline/config/sample.yaml b/midst_models/single_table_TabDDPM/synth_pipeline/config/sample.yaml new file mode 100644 index 0000000..cf9df6b --- /dev/null +++ b/midst_models/single_table_TabDDPM/synth_pipeline/config/sample.yaml @@ -0,0 +1,6 @@ +defaults: + - _self_ + +data_dir: /projects/aieng/midst_competition/data/tabddpm_round3/tabddpm_1 +data_name: "trans" +exp_name: "train_1" \ No newline at end of file diff --git a/midst_models/single_table_TabDDPM/synth_pipeline/config/train.yaml b/midst_models/single_table_TabDDPM/synth_pipeline/config/train.yaml new file mode 100644 index 0000000..ed6a7d7 --- /dev/null +++ b/midst_models/single_table_TabDDPM/synth_pipeline/config/train.yaml @@ -0,0 +1,7 @@ +defaults: + - _self_ + +data_dir: /projects/aieng/midst_competition/data/tabddpm_wb/tabddpm_1 +data_name: "trans" +# exp_name: "train_1" +exp_name: "update" \ No newline at end of file diff --git a/midst_models/single_table_TabDDPM/synth_pipeline/evaluate.py b/midst_models/single_table_TabDDPM/synth_pipeline/evaluate.py new file mode 100644 index 0000000..7841b93 --- /dev/null +++ b/midst_models/single_table_TabDDPM/synth_pipeline/evaluate.py @@ -0,0 +1,99 @@ +import os +import json +import torch +import hydra +import logging +from 
omegaconf import DictConfig +from pprint import pprint + +from midst_models.single_table_TabSyn.scripts.eval.eval_density import eval_density +from midst_models.single_table_TabSyn.scripts.eval.eval_quality import eval_quality +from midst_models.single_table_TabSyn.scripts.eval.eval_mle import eval_mle +from midst_models.single_table_TabSyn.scripts.eval.eval_dcr import eval_dcr +from midst_models.single_table_TabSyn.scripts.eval.eval_detection import eval_detection + +logger = logging.getLogger(__name__) + +@hydra.main(config_path="config", config_name="evaluate", version_base="1.1") +def main(cfg: DictConfig): + data_dir = cfg.data_dir + exp_name = cfg.exp_name + data_name = cfg.data_name + workspace_dir = os.path.join(data_dir, "workspace", exp_name) + + main_data_path = cfg.main_data_path + train_data_path = os.path.join(data_dir, "train.csv") + test_data_path = os.path.join(data_dir, "test.csv") + synth_data_path = os.path.join(workspace_dir, data_name, "_final", f"{data_name}_synthetic.csv") + + + info_path = os.path.join(data_dir, "info.json") + + eval_results = {} + + # 1. Density Evaluation + if cfg.eval_flags.eval_density: + logger.info("Running density evaluation for train data...") + shape, trend = eval_density(synth_data_path, train_data_path, info_path) + eval_results['density_shape_train'] = shape + eval_results['density_trend_train'] = trend + logger.info(f"Shape: {shape}, Trend: {trend}") + + logger.info("Running density evaluation for all data...") + shape, trend = eval_density(synth_data_path, main_data_path, info_path) + eval_results['density_shape_all'] = shape + eval_results['density_trend_all'] = trend + logger.info(f"Shape: {shape}, Trend: {trend}") + + # 2. 
Quality Evaluation + if cfg.eval_flags.eval_quality: + logger.info("Running quality evaluation...") + alpha_precision, beta_recall = eval_quality(synth_data_path, train_data_path, info_path) + eval_results['alpha_precision'] = alpha_precision + eval_results['beta_recall'] = beta_recall + logger.info(f"Alpha Precision: {alpha_precision}, Beta Recall: {beta_recall}") + + # 3. DCR (Distance to Closest Record) Evaluation + if cfg.eval_flags.eval_dcr: + logger.info("Running DCR evaluation...") + with open(info_path, "r") as file: + data_info = json.load(file) + ideal_dcr = data_info["train_num"] / (data_info["train_num"] + data_info["test_num"]) + + dcr_score = eval_dcr(synth_data_path, train_data_path, test_data_path, info_path) + eval_results['dcr_score'] = dcr_score + eval_results['ideal_dcr'] = ideal_dcr + logger.info(f"DCR Score: {dcr_score}, Ideal DCR: {ideal_dcr}") + + # 4. Detection Evaluation + if cfg.eval_flags.eval_detection: + logger.info("Running detection evaluation for train data...") + detection_score = eval_detection(synth_data_path, train_data_path, info_path, data_name, model="tabddpm") + eval_results['detection_score_train'] = detection_score + logger.info(f"Detection Score: {detection_score}") + + logger.info("Running detection evaluation for all data...") + detection_score = eval_detection(synth_data_path, main_data_path, info_path, data_name, model="tabddpm") + eval_results['detection_score_all'] = detection_score + logger.info(f"Detection Score: {detection_score}") + + # 5. 
MLE Evaluation + if cfg.eval_flags.eval_mle: + logger.info("Running MLE evaluation...") + mle_synth_score = eval_mle(synth_data_path, test_data_path, info_path) + mle_baseline_score = eval_mle(train_data_path, test_data_path, info_path) + eval_results['mle_synth_score'] = mle_synth_score + eval_results['mle_baseline_score'] = mle_baseline_score + logger.info("MLE Scores:") + logger.info(f"Synthetic: {mle_synth_score}, Baseline: {mle_baseline_score}") + + # Save evaluation results to a file + eval_json_path = os.path.join(workspace_dir, f"{data_name}_eval.json") + with open(eval_json_path, 'w') as eval_file: + json.dump(eval_results, eval_file, indent=4) + + logger.info(f"Evaluation results saved to {eval_json_path}") + + +if __name__ == "__main__": + main() diff --git a/midst_models/single_table_TabDDPM/synth_pipeline/generate_subsets.py b/midst_models/single_table_TabDDPM/synth_pipeline/generate_subsets.py new file mode 100644 index 0000000..c6040b3 --- /dev/null +++ b/midst_models/single_table_TabDDPM/synth_pipeline/generate_subsets.py @@ -0,0 +1,138 @@ +import os +import json +import pandas as pd +import numpy as np +import hydra +import logging +import shutil +from omegaconf import DictConfig +from sklearn.model_selection import train_test_split + +logger = logging.getLogger(__name__) + +@hydra.main(config_path="config", config_name="generate_subsets", version_base="1.1") +def generate_subsets(cfg: DictConfig): + + num_subsets = cfg.num_subsets + subset_size = cfg.subset_size + test_size = cfg.test_size + challenge_size = cfg.challenge_size // 2 + + os.makedirs(cfg.save_data_dir, exist_ok=True) + + main_file = os.path.join(cfg.main_data_dir, f'{cfg.data_name}.csv') + df = pd.read_csv(main_file) + + main_clean = df.drop(columns=[col for col in df.columns if col.endswith('_id')]) + main_clean_file = os.path.join(cfg.save_data_dir, f'{cfg.data_name}.csv') + main_clean.to_csv(main_clean_file, index=False) + logger.info(f"Main dataset (without _id columns) saved at 
{main_clean_file}") + + for subset_num in range(111, num_subsets + 111): + subset_dir = os.path.join(cfg.save_data_dir, f'tabddpm_{subset_num}') + os.makedirs(subset_dir, exist_ok=True) + + # seed = subset_num * 100 + 4 + # seed = subset_num * 100 + 12 + seed = subset_num * 100 + 24 + rng = np.random.RandomState(seed=seed) + # create the train set + subset = df.sample(n=subset_size, replace=False, random_state=rng) + + train_with_id_file = os.path.join(subset_dir, f'train_with_id.csv') + subset.to_csv(train_with_id_file, index=False) + + # create the challenge set + # rng = np.random.RandomState(seed=seed + 5) + # rng = np.random.RandomState(seed=seed + 13) + rng = np.random.RandomState(seed=seed + 25) + challenge_in_subset = subset.sample(n=challenge_size, random_state=rng) + + # rng = np.random.RandomState(seed=seed + 6) + # rng = np.random.RandomState(seed=seed + 14) + rng = np.random.RandomState(seed=seed + 26) + challenge_out_subset = df[~df.index.isin(subset.index)].sample( + n=challenge_size, random_state=rng + ) + challenge_set = pd.concat([challenge_in_subset, challenge_out_subset]) + + # create the test set + combined_challenge_indices = challenge_set.index + # rng = np.random.RandomState(seed=seed + 7) + # rng = np.random.RandomState(seed=seed + 15) + rng = np.random.RandomState(seed=seed + 27) + test_set = df[~df.index.isin(subset.index) & ~df.index.isin(combined_challenge_indices)].sample( + n=test_size, random_state=rng + ) + test_with_id_file = os.path.join(subset_dir, f'test_with_id.csv') + test_set.to_csv(test_with_id_file, index=False) + + # shuffle the challenge set + challenge_set = challenge_set.sample(frac=1, random_state=subset_num).reset_index(drop=True) + challenge_with_id_file = os.path.join(subset_dir, f'challenge_with_id.csv') + challenge_set.to_csv(challenge_with_id_file, index=False) + + # create the challenge label file + labels = [] + for idx, row in challenge_set.iterrows(): + if (subset == row).all(axis=1).any(): + labels.append(1) 
+ else: + labels.append(0) + + challenge_label_df = pd.DataFrame({'is_train': labels}) + challenge_label_file = os.path.join(subset_dir, f'challenge_label.csv') + challenge_label_df.to_csv(challenge_label_file, index=False) + + # drop the _id columns and save the files + subset_clean = subset.drop(columns=[col for col in subset.columns if col.endswith('_id')]) + train_file = os.path.join(subset_dir, f'train.csv') + subset_clean.to_csv(train_file, index=False) + + challenge_clean = challenge_set.drop(columns=[col for col in challenge_set.columns if col.endswith('_id')]) + challenge_file = os.path.join(subset_dir, f'challenge.csv') + challenge_clean.to_csv(challenge_file, index=False) + + test_clean = test_set.drop(columns=[col for col in test_set.columns if col.endswith('_id')]) + test_file = os.path.join(subset_dir, f'test.csv') + test_clean.to_csv(test_file, index=False) + + # save the data json and info json files + data_json_path = os.path.join(cfg.main_data_dir, f'{cfg.data_name}.json') + with open(data_json_path, 'r') as f: + data_config = json.load(f) + data_config['general']['data_dir'] = subset_dir + data_config['general']['exp_name'] = f'tabddpm_{subset_num}' + data_config['general']['workspace_dir'] = os.path.join(subset_dir, 'workspace') + data_config['general']['test_data_dir'] = subset_dir + + data_json_save_path = os.path.join(subset_dir, f'{cfg.data_name}.json') + with open(data_json_save_path, 'w') as f: + json.dump(data_config, f, indent=4) + + info_json_path = os.path.join(cfg.main_data_dir, 'info.json') + with open(info_json_path, 'r') as f: + info_config = json.load(f) + info_config['data_path'] = train_file + info_config['train_num'] = len(subset_clean) + info_config['test_num'] = len(test_clean) + + info_json_save_path = os.path.join(subset_dir, 'info.json') + with open(info_json_save_path, 'w') as f: + json.dump(info_config, f, indent=4) + + # copy the files in files_to_copy to the subset directory + for file_name in cfg.files_to_copy: + 
src_file = os.path.join(cfg.main_data_dir, file_name) + dest_file = os.path.join(subset_dir, file_name) + if os.path.exists(src_file): + shutil.copy(src_file, dest_file) + logger.info(f"Copied {file_name} to {dest_file}") + else: + logger.warning(f"{file_name} not found in {cfg.main_data_dir}") + + logger.info(f"Subset {subset_num} generated successfully in {subset_dir}") + + +if __name__ == "__main__": + generate_subsets() diff --git a/midst_models/single_table_TabDDPM/synth_pipeline/sample.py b/midst_models/single_table_TabDDPM/synth_pipeline/sample.py new file mode 100644 index 0000000..01b26ea --- /dev/null +++ b/midst_models/single_table_TabDDPM/synth_pipeline/sample.py @@ -0,0 +1,58 @@ +import os +import logging +import hydra +from omegaconf import DictConfig +from midst_models.single_table_TabDDPM.complex_pipeline import ( + clava_clustering, + clava_training, + clava_load_pretrained, + clava_synthesizing, + load_configs, +) +from midst_models.single_table_TabDDPM.pipeline_modules import load_multi_table + +logger = logging.getLogger(__name__) + +@hydra.main(config_path="config", config_name="sample", version_base="1.1") +def main(cfg: DictConfig): + data_dir = cfg.data_dir + exp_name = cfg.exp_name + data_name = cfg.data_name + + exp_dir = os.path.join(data_dir, "workspace", exp_name) + + config_path = os.path.join(exp_dir, f"{data_name}.json") + logger.info(f"Loading configuration from: {config_path}") + configs, _ = load_configs(config_path) + + tables, relation_order, dataset_meta = load_multi_table(data_dir) + + params_clustering = configs['clustering'] + logger.info(f"{'='*20} Clustering Parameters {'='*20}") + for key, val in params_clustering.items(): + logger.info(f"{key}: {val}") + + tables, all_group_lengths_prob_dicts = clava_clustering(tables, relation_order, exp_dir, configs) + + logger.info("Loading pretrained models from: %s", exp_dir) + models = clava_load_pretrained(relation_order, exp_dir) + + params_sampling = configs['sampling'] + 
logger.info(f"{'='*20} Sampling Parameters {'='*20}") + for key, val in params_sampling.items(): + logger.info(f"{key}: {val}") + + sample_scale = 1 if 'debug' not in configs else configs['debug']['sample_scale'] + cleaned_tables, synthesizing_time_spent, matching_time_spent = clava_synthesizing( + # tables, + list(tables.keys()), + relation_order, + exp_dir, + all_group_lengths_prob_dicts, + models, + configs, + sample_scale=sample_scale + ) + +if __name__ == "__main__": + main() diff --git a/midst_models/single_table_TabDDPM/synth_pipeline/train.py b/midst_models/single_table_TabDDPM/synth_pipeline/train.py new file mode 100644 index 0000000..794d5c8 --- /dev/null +++ b/midst_models/single_table_TabDDPM/synth_pipeline/train.py @@ -0,0 +1,59 @@ +import os +import json +import logging +import shutil +import uuid +import hydra +from omegaconf import DictConfig + +import sys +import os + +from midst_models.single_table_TabDDPM.complex_pipeline import ( + clava_clustering, + clava_training, + clava_load_pretrained, + clava_synthesizing, + clava_load_synthesized_data, + clava_eval, + load_configs, +) +from midst_models.single_table_TabDDPM.pipeline_modules import load_multi_table +from midst_models.single_table_TabDDPM.gen_single_report import gen_single_report +from midst_models.single_table_TabDDPM.pipeline_utils import load_multi_table + +logger = logging.getLogger(__name__) + +@hydra.main(config_path="config", config_name="train", version_base="1.1") +def main(cfg: DictConfig): + logger.info(f"running experiment: {cfg.exp_name}") + + data_dir = cfg.data_dir + logger.info(f"Data directory: {data_dir}") + + config_path = os.path.join(data_dir, f"{cfg.data_name}.json") + logger.info(f"Loading configuration from: {config_path}") + configs, _ = load_configs(config_path) + configs['general']['exp_name'] = cfg.exp_name + + save_dir = os.path.join(configs['general']['workspace_dir'], configs['general']['exp_name']) + os.makedirs(save_dir, exist_ok=True) + 
os.makedirs(os.path.join(save_dir, 'models'), exist_ok=True) + os.makedirs(os.path.join(save_dir, 'before_matching'), exist_ok=True) + + updated_config_path = os.path.join(save_dir, f"{cfg.data_name}.json") + with open(updated_config_path, 'w') as config_file: + json.dump(configs, config_file, indent=4) + logger.info(f"Exp configuration saved to: {updated_config_path}") + + tables, relation_order, dataset_meta = load_multi_table(data_dir) + + params_clustering = configs['clustering'] + tables, all_group_lengths_prob_dicts = clava_clustering(tables, relation_order, save_dir, configs) + + params_training = configs['diffusion'] + logger.info(f"Training parameters: {params_training}") + models = clava_training(tables, relation_order, save_dir, configs) + +if __name__ == "__main__": + main() diff --git a/midst_models/single_table_TabSyn/slurm_scripts/tabsyn.sh b/midst_models/single_table_TabSyn/slurm_scripts/tabsyn.sh new file mode 100755 index 0000000..c9451d7 --- /dev/null +++ b/midst_models/single_table_TabSyn/slurm_scripts/tabsyn.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +main_dataset_dir="/projects/aieng/midst_competition/data/tabsyn_round3" +log_dir="/projects/aieng/midst_competition/logs/tabsyn_round3" + +datasets=($(ls -d $main_dataset_dir/*/ | grep -v "all_data" | sort)) + +num_gpus=7 +batch_size=7 +num_datasets=${#datasets[@]} + +for (( i=0; i<$num_datasets; i+=batch_size )); do + batch=("${datasets[@]:i:batch_size}") + job_ids=() + for dataset in "${batch[@]}"; do + dataset_name=$(basename $dataset) + output_log="${log_dir}/${dataset_name}_%j.log" + echo "Submitting job for dataset: $dataset_name" + echo "Dataset path: $dataset" + job_id=$(sbatch --job-name="${dataset_name}" \ + --output=$output_log \ + --export=DATASET_DIR=$dataset \ + tabsyn.slrm &) + job_ids+=($job_id) + if [[ $(jobs -r -p | wc -l) -ge $num_gpus ]]; then + wait -n + fi + done + echo "Sleeping for 50 minutes before submitting the next batch..." 
+ sleep 3000 + + # echo "Waiting for batch $((i/batch_size + 1)) to complete..." + # for job_id in "${job_ids[@]}"; do + # while squeue -j $job_id > /dev/null 2>&1; do + # echo "Job $job_id is still running. Checking again in 60 seconds." + # sleep 60 + # done + # done + echo "Batch $((i/batch_size + 1)) complete." +done diff --git a/midst_models/single_table_TabSyn/slurm_scripts/tabsyn.slrm b/midst_models/single_table_TabSyn/slurm_scripts/tabsyn.slrm new file mode 100644 index 0000000..9f64a33 --- /dev/null +++ b/midst_models/single_table_TabSyn/slurm_scripts/tabsyn.slrm @@ -0,0 +1,17 @@ +#!/bin/bash +#SBATCH -p a40 +#SBATCH --job-name=train +#SBATCH --output=output_%j.log +#SBATCH --gres=gpu:1 +#SBATCH --qos=m +#SBATCH -c 4 +#SBATCH --mem=30G +#SBATCH --ntasks=1 +#SBATCH --time=12:00:00 + +export HYDRA_FULL_ERROR=1 +source /scratch/ssd004/scratch/malinoori/poetry/midst-competition-rGytac8L-py3.9/bin/activate +cd /fs01/home/malinoori/MIDSTModels/midst_models/single_table_TabSyn/synth_pipeline +python train.py data_dir="$DATASET_DIR" +python sample.py data_dir="$DATASET_DIR" +python evaluate.py data_dir="$DATASET_DIR" \ No newline at end of file diff --git a/midst_models/single_table_TabSyn/synth_pipeline/__init__.py b/midst_models/single_table_TabSyn/synth_pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/midst_models/single_table_TabSyn/synth_pipeline/config/evaluate.yaml b/midst_models/single_table_TabSyn/synth_pipeline/config/evaluate.yaml new file mode 100644 index 0000000..0db5c9b --- /dev/null +++ b/midst_models/single_table_TabSyn/synth_pipeline/config/evaluate.yaml @@ -0,0 +1,13 @@ +defaults: + - _self_ + +data_dir: /projects/aieng/midst_competition/data/tabsyn_round3/tabsyn_1 +main_data_path: /projects/aieng/midst_competition/data/tabsyn/trans.csv +data_name: "trans" +exp_name: "train_1" +eval_flags: + eval_density: True + eval_quality: True + eval_dcr: True + eval_detection: True + eval_mle: True diff --git 
a/midst_models/single_table_TabSyn/synth_pipeline/config/generate_subsets.yaml b/midst_models/single_table_TabSyn/synth_pipeline/config/generate_subsets.yaml new file mode 100644 index 0000000..91abe1d --- /dev/null +++ b/midst_models/single_table_TabSyn/synth_pipeline/config/generate_subsets.yaml @@ -0,0 +1,10 @@ +main_data_dir: /projects/aieng/midst_competition/data/berka/tabsyn # Path to the main data directory where the whole dataset is stored along with config templates +save_data_dir: /projects/aieng/midst_competition/data/tabsyn_round3 # Path to the save data directory where the subsets will be saved +main_info_file: /projects/aieng/midst_competition/data/tabsyn/all_data/processed_data/trans/info.json +data_name: trans # Name of the dataset +files_to_copy: ['trans.toml'] + +num_subsets: 40 # Number of subsets to create +subset_size: 20000 # Number of rows in each subset +test_size: 10000 # Number of rows in the test set +challenge_size: 200 # Number of rows in the challenge set \ No newline at end of file diff --git a/midst_models/single_table_TabSyn/synth_pipeline/config/sample.yaml b/midst_models/single_table_TabSyn/synth_pipeline/config/sample.yaml new file mode 100644 index 0000000..7289155 --- /dev/null +++ b/midst_models/single_table_TabSyn/synth_pipeline/config/sample.yaml @@ -0,0 +1,7 @@ +defaults: + - _self_ + +main_data_path: "/projects/aieng/midst_competition/data/tabsyn/all_data/processed_data/trans/" # Path to the main data that have all the categories +data_dir: /projects/aieng/midst_competition/data/tabsyn_round3/tabsyn_1 +data_name: "trans" +exp_name: "train_1" \ No newline at end of file diff --git a/midst_models/single_table_TabSyn/synth_pipeline/config/train.yaml b/midst_models/single_table_TabSyn/synth_pipeline/config/train.yaml new file mode 100644 index 0000000..75bf4b1 --- /dev/null +++ b/midst_models/single_table_TabSyn/synth_pipeline/config/train.yaml @@ -0,0 +1,7 @@ +defaults: + - _self_ + +main_data_path: 
"/projects/aieng/midst_competition/data/tabsyn/all_data/processed_data/trans/" # Path to the main data that have all the categories +data_dir: /projects/aieng/midst_competition/data/tabsyn_round3/tabsyn_1 # Path to the directory where the data will be saved +data_name: "trans" # Name of the data +exp_name: "train_1" # Name of the experiment run diff --git a/midst_models/single_table_TabSyn/synth_pipeline/evaluate.py b/midst_models/single_table_TabSyn/synth_pipeline/evaluate.py new file mode 100644 index 0000000..76b4d4f --- /dev/null +++ b/midst_models/single_table_TabSyn/synth_pipeline/evaluate.py @@ -0,0 +1,99 @@ +import os +import json +import torch +import hydra +import logging +from omegaconf import DictConfig +from pprint import pprint + +from midst_models.single_table_TabSyn.scripts.eval.eval_density import eval_density +from midst_models.single_table_TabSyn.scripts.eval.eval_quality import eval_quality +from midst_models.single_table_TabSyn.scripts.eval.eval_mle import eval_mle +from midst_models.single_table_TabSyn.scripts.eval.eval_dcr import eval_dcr +from midst_models.single_table_TabSyn.scripts.eval.eval_detection import eval_detection + +logger = logging.getLogger(__name__) + +@hydra.main(config_path="config", config_name="evaluate", version_base="1.1") +def main(cfg: DictConfig): + data_dir = cfg.data_dir + exp_name = cfg.exp_name + data_name = cfg.data_name + workspace_dir = os.path.join(data_dir, "workspace", exp_name) + + main_data_path = cfg.main_data_path + processed_data_dir = os.path.join(data_dir, "processed_data", data_name) + train_data_path = os.path.join(processed_data_dir, "train.csv") + test_data_path = os.path.join(processed_data_dir, "test.csv") + synth_data_path = os.path.join(workspace_dir, data_name, "_final", f"{data_name}_synthetic.csv") + + info_path = os.path.join(processed_data_dir, "info.json") + + eval_results = {} + + # 1. 
Density Evaluation + if cfg.eval_flags.eval_density: + logger.info("Running density evaluation for train data...") + shape, trend = eval_density(synth_data_path, train_data_path, info_path) + eval_results['density_shape_train'] = shape + eval_results['density_trend_train'] = trend + logger.info(f"Shape: {shape}, Trend: {trend}") + + logger.info("Running density evaluation for all data...") + shape, trend = eval_density(synth_data_path, main_data_path, info_path) + eval_results['density_shape_all'] = shape + eval_results['density_trend_all'] = trend + logger.info(f"Shape: {shape}, Trend: {trend}") + + # 2. Quality Evaluation + if cfg.eval_flags.eval_quality: + logger.info("Running quality evaluation...") + alpha_precision, beta_recall = eval_quality(synth_data_path, train_data_path, info_path) + eval_results['alpha_precision'] = alpha_precision + eval_results['beta_recall'] = beta_recall + logger.info(f"Alpha Precision: {alpha_precision}, Beta Recall: {beta_recall}") + + # 3. DCR (Distance to Closest Record) Evaluation + if cfg.eval_flags.eval_dcr: + logger.info("Running DCR evaluation...") + with open(info_path, "r") as file: + data_info = json.load(file) + ideal_dcr = data_info["train_num"] / (data_info["train_num"] + data_info["test_num"]) + + dcr_score = eval_dcr(synth_data_path, train_data_path, test_data_path, info_path) + eval_results['dcr_score'] = dcr_score + eval_results['ideal_dcr'] = ideal_dcr + logger.info(f"DCR Score: {dcr_score}, Ideal DCR: {ideal_dcr}") + + # 4. 
Detection Evaluation + if cfg.eval_flags.eval_detection: + logger.info("Running detection evaluation for train data...") + detection_score = eval_detection(synth_data_path, train_data_path, info_path, data_name, model="tabddpm") + eval_results['detection_score_train'] = detection_score + logger.info(f"Detection Score: {detection_score}") + + logger.info("Running detection evaluation for all data...") + detection_score = eval_detection(synth_data_path, main_data_path, info_path, data_name, model="tabddpm") + eval_results['detection_score_all'] = detection_score + logger.info(f"Detection Score: {detection_score}") + + # 5. MLE Evaluation + if cfg.eval_flags.eval_mle: + logger.info("Running MLE evaluation...") + mle_synth_score = eval_mle(synth_data_path, test_data_path, info_path) + mle_baseline_score = eval_mle(train_data_path, test_data_path, info_path) + eval_results['mle_synth_score'] = mle_synth_score + eval_results['mle_baseline_score'] = mle_baseline_score + logger.info("MLE Scores:") + logger.info(f"Synthetic: {mle_synth_score}, Baseline: {mle_baseline_score}") + + # Save evaluation results to a file + eval_json_path = os.path.join(workspace_dir, f"{data_name}_eval.json") + with open(eval_json_path, 'w') as eval_file: + json.dump(eval_results, eval_file, indent=4) + + logger.info(f"Evaluation results saved to {eval_json_path}") + + +if __name__ == "__main__": + main() diff --git a/midst_models/single_table_TabSyn/synth_pipeline/generate_subsets.py b/midst_models/single_table_TabSyn/synth_pipeline/generate_subsets.py new file mode 100644 index 0000000..a5bcc05 --- /dev/null +++ b/midst_models/single_table_TabSyn/synth_pipeline/generate_subsets.py @@ -0,0 +1,140 @@ +import os +import json +import pandas as pd +import numpy as np +import hydra +import logging +import shutil +from omegaconf import DictConfig +from sklearn.model_selection import train_test_split + +from midst_models.single_table_TabSyn.scripts.process_dataset import process_data + +logger = 
logger = logging.getLogger(__name__)


@hydra.main(config_path="config", config_name="generate_subsets", version_base="1.1")
def generate_subsets(cfg: DictConfig):
    """Create per-subset train/test/challenge splits for membership-inference runs.

    For each subset, samples a train split from the main CSV, builds a challenge
    set of half members / half non-members, a disjoint test split, writes the
    splits (with and without ``_id`` columns), membership labels, per-subset
    dataset info, and runs ``process_data`` on the result.

    Config keys used: ``num_subsets``, ``subset_size``, ``test_size``,
    ``challenge_size``, ``save_data_dir``, ``main_data_dir``, ``main_info_file``,
    ``data_name``, ``files_to_copy``.
    """
    num_subsets = cfg.num_subsets
    subset_size = cfg.subset_size
    test_size = cfg.test_size
    # Half of the challenge records are members (from the train subset), half are not.
    challenge_size = cfg.challenge_size // 2

    os.makedirs(cfg.save_data_dir, exist_ok=True)

    main_file = os.path.join(cfg.main_data_dir, f'{cfg.data_name}.csv')
    df = pd.read_csv(main_file)

    with open(cfg.main_info_file, 'r') as f:
        main_info = json.load(f)

    # Save a copy of the full dataset without identifier columns.
    main_clean = df.drop(columns=[col for col in df.columns if col.endswith('_id')])
    main_clean_file = os.path.join(cfg.save_data_dir, f'{cfg.data_name}.csv')
    main_clean.to_csv(main_clean_file, index=False)
    logger.info(f"Main dataset (without _id columns) saved at {main_clean_file}")

    # Subset numbering starts at 111 — presumably to keep these runs distinct
    # from earlier batches; TODO confirm.
    for subset_num in range(111, num_subsets + 111):
        subset_dir = os.path.join(cfg.save_data_dir, f'tabsyn_{subset_num}')
        os.makedirs(subset_dir, exist_ok=True)
        raw_data_dir = os.path.join(subset_dir, 'raw_data')
        os.makedirs(raw_data_dir, exist_ok=True)
        processed_data_dir = os.path.join(subset_dir, 'processed_data')
        os.makedirs(processed_data_dir, exist_ok=True)

        # Deterministic, distinct seed per subset and per sampling step.
        seed = subset_num * 100

        # Train split: sampled without replacement from the full dataset.
        rng = np.random.RandomState(seed=seed + 28)
        subset = df.sample(n=subset_size, replace=False, random_state=rng)
        train_with_id_file = os.path.join(subset_dir, 'train_with_id.csv')
        subset.to_csv(train_with_id_file, index=False)

        # Challenge split: member half drawn from the train subset...
        rng = np.random.RandomState(seed=seed + 29)
        challenge_in_subset = subset.sample(n=challenge_size, random_state=rng)

        # ...non-member half drawn from rows outside the train subset.
        rng = np.random.RandomState(seed=seed + 30)
        challenge_out_subset = df[~df.index.isin(subset.index)].sample(
            n=challenge_size, random_state=rng
        )
        challenge_set = pd.concat([challenge_in_subset, challenge_out_subset])

        # Test split: disjoint from both the train subset and the challenge set.
        combined_challenge_indices = challenge_set.index
        rng = np.random.RandomState(seed=seed + 31)
        test_set = df[~df.index.isin(subset.index) & ~df.index.isin(combined_challenge_indices)].sample(
            n=test_size, random_state=rng
        )
        test_with_id_file = os.path.join(subset_dir, 'test_with_id.csv')
        test_set.to_csv(test_with_id_file, index=False)

        # Shuffle so member and non-member rows are interleaved.
        challenge_set = challenge_set.sample(frac=1, random_state=subset_num).reset_index(drop=True)
        challenge_with_id_file = os.path.join(subset_dir, 'challenge_with_id.csv')
        challenge_set.to_csv(challenge_with_id_file, index=False)

        # Membership labels via full-row value equality against the train subset.
        # NOTE(review): value equality can mislabel a non-member row that happens
        # to duplicate a member row — confirm the source data has no duplicates.
        labels = []
        for idx, row in challenge_set.iterrows():
            if (subset == row).all(axis=1).any():
                labels.append(1)
            else:
                labels.append(0)

        challenge_label_df = pd.DataFrame({'is_train': labels})
        challenge_label_file = os.path.join(subset_dir, 'challenge_label.csv')
        challenge_label_df.to_csv(challenge_label_file, index=False)

        # Drop identifier columns before handing the splits to the model pipeline.
        challenge_clean = challenge_set.drop(columns=[col for col in challenge_set.columns if col.endswith('_id')])
        challenge_file = os.path.join(subset_dir, 'challenge.csv')
        challenge_clean.to_csv(challenge_file, index=False)

        subset_clean = subset.drop(columns=[col for col in subset.columns if col.endswith('_id')])
        train_file = os.path.join(raw_data_dir, 'train.csv')
        subset_clean.to_csv(train_file, index=False)

        test_clean = test_set.drop(columns=[col for col in test_set.columns if col.endswith('_id')])
        test_file = os.path.join(raw_data_dir, 'test.csv')
        test_clean.to_csv(test_file, index=False)

        # Point the per-subset dataset info at the freshly written splits.
        info_path = os.path.join(cfg.main_data_dir, 'data_info', f'{cfg.data_name}.json')
        with open(info_path, 'r') as f:
            info_config = json.load(f)
        info_config['data_path'] = train_file
        info_config['test_path'] = test_file

        info_save_path = os.path.join(subset_dir, f'{cfg.data_name}.json')
        with open(info_save_path, 'w') as f:
            json.dump(info_config, f, indent=4)

        process_data(cfg.data_name, subset_dir, subset_dir)

        # process_data rebuilds info.json; restore the column metadata from the
        # main dataset. (The original performed this rewrite twice verbatim;
        # once is sufficient — the second pass rewrote identical content.)
        info_file = os.path.join(subset_dir, 'processed_data', cfg.data_name, 'info.json')
        with open(info_file, 'r') as f:
            info_config = json.load(f)
        info_config['column_info'] = main_info['column_info']
        with open(info_file, 'w') as f:
            json.dump(info_config, f, indent=4)

        # Copy auxiliary files (e.g. configs) into the subset directory.
        for file_name in cfg.files_to_copy:
            src_file = os.path.join(cfg.main_data_dir, file_name)
            dest_file = os.path.join(subset_dir, file_name)
            if os.path.exists(src_file):
                shutil.copy(src_file, dest_file)
                logger.info(f"Copied {file_name} to {dest_file}")
            else:
                logger.warning(f"{file_name} not found in {cfg.main_data_dir}")

        logger.info(f"Subset {subset_num} generated successfully in {subset_dir}")


if __name__ == "__main__":
    generate_subsets()
logger = logging.getLogger(__name__)


@hydra.main(config_path="config", config_name="train", version_base="1.1")
def main(cfg: DictConfig):
    """Generate synthetic data from a trained TabSyn checkpoint.

    Reads the experiment workspace written by train.py, reloads the stored VAE
    latents and the diffusion weights, then samples one synthetic row per
    training row into ``<workspace>/<data_name>/_final/<data_name>_synthetic.csv``.

    NOTE(review): this script shares the Hydra config name "train" with the
    training entry point — confirm that is intentional.
    """
    data_dir = cfg.data_dir
    data_name = cfg.data_name
    ref_data_path = cfg.main_data_path

    # Workspace layout produced by train.py.
    workspace_dir = os.path.join(data_dir, "workspace", cfg.exp_name)
    latent_dir = os.path.join(workspace_dir, "vae")
    synth_dir = os.path.join(workspace_dir, data_name, "_final")
    os.makedirs(synth_dir, exist_ok=True)
    synth_data_path = os.path.join(synth_dir, f"{data_name}_synthetic.csv")

    config_path = os.path.join(data_dir, f"{data_name}.toml")
    logger.info(f"Loading configuration from: {config_path}")
    run_config = load_config(config_path)

    processed_data_dir = os.path.join(data_dir, "processed_data", data_name)
    info_file_path = os.path.join(processed_data_dir, "info.json")

    # Tokenize the tables exactly as during training.
    X_num, X_cat, categories, d_numerical = preprocess(
        processed_data_dir,
        ref_dataset_path=ref_data_path,
        transforms=run_config["transforms"],
        task_type=run_config["task_type"],
    )

    # Split into train/test halves and move everything to tensors.
    train_num, test_num = X_num
    train_cat, test_cat = X_cat
    train_num = torch.tensor(train_num).float()
    test_num = torch.tensor(test_num).float()
    train_cat = torch.tensor(train_cat)
    test_cat = torch.tensor(test_cat)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    test_num = test_num.float().to(device)
    test_cat = test_cat.to(device)

    vae_cfg = run_config["train"]["vae"]
    train_loader = DataLoader(
        TabularDataset(train_num.float(), train_cat),
        batch_size=vae_cfg["batch_size"],
        shuffle=True,
        num_workers=vae_cfg["num_dataset_workers"],
    )

    tabsyn = TabSyn(
        train_loader,
        test_num,
        test_cat,
        num_numerical_features=d_numerical,
        num_classes=categories,
        device=device,
    )

    # The VAE is instantiated (untrained) so its stored latents can be reloaded.
    tabsyn.instantiate_vae(**run_config["model_params"], optim_params=None)

    train_z, token_dim = tabsyn.load_latent_embeddings(latent_dir)
    logger.info(f"Loaded latent embeddings with token dim: {token_dim}")

    tabsyn.instantiate_diffusion(in_dim=train_z.shape[1], hid_dim=train_z.shape[1], optim_params=None)
    logger.info("Instantiated diffusion model")

    tabsyn.load_model_state(ckpt_dir=workspace_dir, dif_ckpt_name="model.pt")
    logger.info(f"Loaded model state from {workspace_dir}")

    with open(info_file_path, "r") as file:
        data_info = json.load(file)
    data_info["token_dim"] = token_dim
    logger.info(f"Loaded data info from {info_file_path}")

    # Re-run preprocessing only to obtain the inverse tokenizers for decoding.
    _, _, _, _, num_inverse, cat_inverse = preprocess(
        processed_data_dir,
        ref_dataset_path=ref_data_path,
        transforms=run_config["transforms"],
        task_type=run_config["task_type"],
        inverse=True
    )
    logger.info(f"Retrieved inverse tokenizers for {data_name}")

    # Draw one synthetic row per training row, seeded from the mean latent.
    tabsyn.sample(
        num_samples=train_z.shape[0],
        in_dim=train_z.shape[1],
        mean_input_emb=train_z.mean(0),
        info=data_info,
        num_inverse=num_inverse,
        cat_inverse=cat_inverse,
        save_path=synth_data_path,
    )
    logger.info(f"Sampled data saved to {synth_data_path}")


if __name__ == "__main__":
    main()
logger = logging.getLogger(__name__)


@hydra.main(config_path="config", config_name="train", version_base="1.1")
def main(cfg: DictConfig):
    """Train the TabSyn VAE and latent diffusion model for one dataset.

    Pipeline: preprocess the tables, train the VAE, save its latent
    embeddings, then train the diffusion model on the normalized latents.
    Checkpoints and embeddings go to ``<data_dir>/workspace/<exp_name>``.

    Config keys used: ``data_dir``, ``data_name``, ``exp_name``,
    ``main_data_path``; per-task settings come from ``<data_name>.toml``.
    """
    logger.info(f"running experiment: {cfg.exp_name}")

    data_dir = cfg.data_dir
    data_name = cfg.data_name
    exp_name = cfg.exp_name
    ref_data_path = cfg.main_data_path
    processed_data_dir = os.path.join(data_dir, "processed_data", data_name)
    logger.info(f"Data directory: {data_dir}")

    # Per-task configuration (transforms, model/optimizer hyperparameters).
    config_path = os.path.join(data_dir, f"{data_name}.toml")
    logger.info(f"Loading configuration from: {config_path}")
    raw_config = load_config(config_path)

    save_dir = os.path.join(data_dir, "workspace", exp_name)
    os.makedirs(save_dir, exist_ok=True)
    vae_dir = os.path.join(save_dir, "vae")
    os.makedirs(vae_dir, exist_ok=True)

    # Tokenize numerical / categorical features.
    X_num, X_cat, categories, d_numerical = preprocess(
        processed_data_dir,
        ref_dataset_path=ref_data_path,
        transforms=raw_config["transforms"],
        task_type=raw_config["task_type"],
    )

    # Separate train and test data
    X_train_num, X_test_num = X_num
    X_train_cat, X_test_cat = X_cat

    # Convert data to float tensors
    X_train_num, X_test_num = torch.tensor(X_train_num).float(), torch.tensor(X_test_num).float()
    X_train_cat, X_test_cat = torch.tensor(X_train_cat), torch.tensor(X_test_cat)

    # Create dataset module
    train_data = TabularDataset(X_train_num.float(), X_train_cat)

    # Set up device; test tensors live on-device for evaluation during training.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    X_test_num = X_test_num.float().to(device)
    X_test_cat = X_test_cat.to(device)

    # Create train DataLoader
    train_loader = DataLoader(
        train_data,
        batch_size=raw_config["train"]["vae"]["batch_size"],
        shuffle=True,
        num_workers=raw_config["train"]["vae"]["num_dataset_workers"],
    )

    # Initialize the TabSyn model
    tabsyn = TabSyn(
        train_loader,
        X_test_num,
        X_test_cat,
        num_numerical_features=d_numerical,
        num_classes=categories,
        device=device
    )

    # Instantiate and train the VAE model
    tabsyn.instantiate_vae(**raw_config["model_params"], optim_params=raw_config["train"]["optim"]["vae"])

    tabsyn.train_vae(
        **raw_config["loss_params"],
        num_epochs=raw_config["train"]["vae"]["num_epochs"],
        save_path=vae_dir,
    )

    # Save VAE embeddings
    tabsyn.save_vae_embeddings(X_train_num, X_train_cat, vae_ckpt_dir=vae_dir)

    # Load latent embeddings from VAE
    train_z, _ = tabsyn.load_latent_embeddings(vae_dir)

    # Normalize latents: center, then scale by 2 — the upstream TabSyn
    # convention. (Deliberately NOT divided by the per-dimension std; the
    # original computed an unused std here, now removed.)
    mean = train_z.mean(0)
    latent_train_data = (train_z - mean) / 2

    # Create DataLoader for latent space
    latent_train_loader = DataLoader(
        latent_train_data,
        batch_size=raw_config["train"]["diffusion"]["batch_size"],
        shuffle=True,
        num_workers=raw_config["train"]["diffusion"]["num_dataset_workers"],
    )

    # Instantiate and train the Diffusion model
    tabsyn.instantiate_diffusion(
        in_dim=train_z.shape[1],
        hid_dim=train_z.shape[1],
        optim_params=raw_config["train"]["optim"]["diffusion"]
    )

    tabsyn.train_diffusion(
        latent_train_loader,
        num_epochs=raw_config["train"]["diffusion"]["num_epochs"],
        ckpt_path=save_dir,
    )

if __name__ == "__main__":
    main()