diff --git a/README.md b/README.md index ca22f10..47021d3 100644 --- a/README.md +++ b/README.md @@ -58,17 +58,26 @@ The dataset generation script is designed to create diverse graph instances and 5. **Filtering**: Only saves instances where the QAOA solution achieves a high approximation ratio (e.g., > 0.85) relative to the optimal solution. **Usage Arguments:** -- `--problem`: The optimization problem to solve (`MAXCUT`, `MIS`, `MAX_CLIQUE`). Default: `MAXCUT`. -- `--samples`: Target number of samples to generate. Default: `100`. -- `--nodes`: Range of nodes (format "min-max", e.g. `8-16`). Default: `8-16`. +- `--samples`: Target number of total samples to generate. Default: `100`. +- `--problem`: Single optimization problem to solve (`MAXCUT`, `MIS`, `MAX_CLIQUE`). Used only if `--mix` is not provided. Default: `MAXCUT`. +- `--nodes`: Range of nodes (format "min-max", e.g. `8-16`). Used only if `--mix` is not provided. Default: `8-16`. +- `--mix`: Allows generating multiple problem types, sizes, and proportions in a single run. Expects a list in the format `PROBLEM:PROPORTION:NODES`. - `--workers`: Number of parallel processes to use. Default: `os.cpu_count()-1`. -- `--output`: Path to save the generated JSON file. Default: `Dataset/qaoa_dataset.json`. +- `--output`: Path to save the generated JSON file. Default: `Dataset/mixed_qaoa_dataset.json`. 
-**Example Command:** +**Example Commands:** + +Generate a standard dataset with a single problem: ```bash python generate_v1_dataset.py --problem MIS --samples 500 --nodes 10-14 --output Dataset/train_mis.json ``` +Generate a mixed dataset containing multiple problem types: +```bash +python generate_v1_dataset.py --samples 100000 --mix MAXCUT:0.4:8-16 MIS:0.3:10-20 MAX_CLIQUE:0.3:6-12 +``` +*Note: Using `--mix` generates all instances sequentially per problem type (e.g., all MAXCUT first, then MIS, then MAX_CLIQUE) and merges them into a single coherent JSON dataset.* + ### Output Data Structure The output is a JSON file containing a list of graph instances. Each instance is a dictionary with the following fields: diff --git a/generate_v1_dataset.py b/generate_v1_dataset.py index 5fe1cf1..6ec560e 100644 --- a/generate_v1_dataset.py +++ b/generate_v1_dataset.py @@ -183,8 +183,8 @@ def cost(pv): def worker_task(args): idx, n_min, n_max, p, problem = args # Re-seed random for each process to ensure diversity - random.seed(idx + int(time.time())) - np.random.seed(idx + int(time.time())) + random.seed(idx + int(time.time() * 1000)) + np.random.seed((idx + int(time.time() * 1000)) % (2**32 - 1)) thresholds = {'MAXCUT': 0.85, 'MIS': 0.85, 'MAX_CLIQUE': 0.85} try: @@ -202,40 +202,91 @@ def worker_task(args): except: return None def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--problem", type=str, default="MAXCUT", choices=["MAXCUT", "MIS", "MAX_CLIQUE"]) - parser.add_argument("--samples", type=int, default=100) - parser.add_argument("--nodes", type=str, default="8-16") + parser = argparse.ArgumentParser(description="FastQuantum V1 Dataset Generator") + parser.add_argument("--samples", type=int, default=100, help="Total number of samples to generate") + parser.add_argument("--problem", type=str, default="MAXCUT", choices=["MAXCUT", "MIS", "MAX_CLIQUE"], help="Single problem type (if --mix is not used)") + parser.add_argument("--nodes", type=str, 
default="8-16", help="Node range (if --mix is not used)") + parser.add_argument("--mix", type=str, nargs="+", + help="List of mixes in format PROBLEM:PROPORTION:NODES (e.g., MAXCUT:0.4:8-16 MIS:0.3:10-20 MAX_CLIQUE:0.3:6-12)") parser.add_argument("--workers", type=int, default=os.cpu_count()-1) - parser.add_argument("--output", type=str, default="Dataset/qaoa_dataset.json") + parser.add_argument("--output", type=str, default="Dataset/mixed_qaoa_dataset.json") args = parser.parse_args() - n_min, n_max = map(int, args.nodes.split('-')) - print(f"Starting Generation | Problem: {args.problem} | Target: {args.samples}") - print(f"Nodes: {n_min}-{n_max} | Workers: {args.workers}") + if args.mix: + mix_configs = [] + total_prop = 0.0 + for m in args.mix: + parts = m.split(':') + if len(parts) != 3: + raise ValueError(f"Invalid mix format: {m}. Use PROBLEM:PROPORTION:NODES") + prob, prop, nodes = parts + prop = float(prop) + total_prop += prop + n_min, n_max = map(int, nodes.split('-')) + mix_configs.append({'problem': prob, 'proportion': prop, 'n_min': n_min, 'n_max': n_max}) + + for config in mix_configs: + config['target'] = int((config['proportion'] / total_prop) * args.samples) + + current_sum = sum(c['target'] for c in mix_configs) + if current_sum < args.samples and mix_configs: + mix_configs[0]['target'] += (args.samples - current_sum) + else: + n_min, n_max = map(int, args.nodes.split('-')) + mix_configs = [{'problem': args.problem, 'target': args.samples, 'n_min': n_min, 'n_max': n_max}] + + print(f"Starting Generation | Target Total: {args.samples} | Workers: {args.workers}") + print("Configuration:") + for c in mix_configs: + print(f" - {c['problem']}: {c['target']} samples, {c['n_min']}-{c['n_max']} nodes") - dataset, attempts = [], 0 + dataset = [] + total_attempts = 0 pool = mp.Pool(args.workers) - # Use imap_unordered for real-time progress updates - task_args = ((i, n_min, n_max, DEFAULT_P_LAYERS, args.problem) for i in range(args.samples * 100)) try: - 
for result in pool.imap_unordered(worker_task, task_args): - attempts += 1 - if result: - dataset.append(result) - print(f"Progress: {len(dataset)}/{args.samples} (Attempts: {attempts})", end='\r', flush=True) + global_idx = 0 + for config in mix_configs: + target = config['target'] + if target <= 0: + continue + + problem = config['problem'] + n_min = config['n_min'] + n_max = config['n_max'] + + # Create a generator for this specific problem (we generate far more tasks than needed and stop gracefully once the target is reached) + task_args = ((global_idx + i, n_min, n_max, DEFAULT_P_LAYERS, problem) for i in range(target * 100)) + + problem_dataset = [] + attempts = 0 + for result in pool.imap_unordered(worker_task, task_args): + attempts += 1 + if result: + problem_dataset.append(result) + print(f"Progress {problem}: {len(problem_dataset)}/{target} (Attempts: {attempts})", end='\r', flush=True) + + if len(problem_dataset) >= target: + break + print(f"\nFinished {problem}: {len(problem_dataset)}/{target} (Attempts: {attempts})") + dataset.extend(problem_dataset) + total_attempts += attempts + global_idx += attempts - if len(dataset) >= args.samples: - break except KeyboardInterrupt: print("\nInterrupted by user. Saving partial dataset...") pool.terminate() pool.join() - print(f"\nFinal: {len(dataset)}/{args.samples} (Total attempts: {attempts})") + # Shuffle the merged dataset to avoid ordering biases during model training + random.shuffle(dataset) + for i, sample in enumerate(dataset): + sample['id'] = i + + print(f"\nFinal Total: {len(dataset)}/{sum(c['target'] for c in mix_configs)} (Total attempts: {total_attempts})") + os.makedirs(os.path.dirname(args.output), exist_ok=True) with open(args.output, 'w') as f: json.dump(dataset, f, indent=2) print(f"Done. Saved to {args.output}")