# dxfedit/03_Python_OpenSource_DXF/generate_template_from_json.py

import json
import os
import argparse

def find_table_boundaries(lines):
    """
    Finds the header boundaries of the table structure from LINE entities.

    It assumes the lowest horizontal lines form the header: with three or more
    distinct horizontal levels the header spans from the lowest level up to the
    third lowest (a two-row header); with exactly two levels it spans both
    (a single-row header).

    Args:
        lines (list): LINE entities as dicts whose 'start' and 'end' values are
            coordinate sequences indexed as [x, y].

    Returns:
        tuple: (boundaries, vert_lines_x).
            boundaries is a dict with the keys x_min, x_max, y_min, y_max,
            header_total_height and data_start_y.
            vert_lines_x is the sorted list of unique X coordinates (rounded to
            2 decimals) of the vertical lines.
            (None, None) is returned when `lines` is empty or fewer than two
            distinct horizontal levels exist.
    """
    if not lines:
        return None, None

    # A line is treated as horizontal (or vertical) when its endpoints differ
    # by less than this amount along the other axis.
    axis_tolerance = 0.1

    x_coords = [point[0] for line in lines for point in (line['start'], line['end'])]
    min_x, max_x = min(x_coords), max(x_coords)

    # Collect the distinct Y levels of horizontal lines, lowest first. Levels
    # closer together than the tolerance are merged (keeping the lowest value),
    # so a duplicated or slightly noisy line is not counted as an extra row.
    horiz_lines_y = []
    for y in sorted(
        line['start'][1]
        for line in lines
        if abs(line['start'][1] - line['end'][1]) < axis_tolerance
    ):
        if not horiz_lines_y or y - horiz_lines_y[-1] >= axis_tolerance:
            horiz_lines_y.append(y)

    # Assume the header is composed of the bottom two sections
    if len(horiz_lines_y) < 3:
        print("Warning: Could not clearly identify a 2-row header structure.")
        # Fallback for a single-row header
        if len(horiz_lines_y) < 2:
            return None, None
        y_top = horiz_lines_y[1]
    else:
        y_top = horiz_lines_y[2]
    y_bottom = horiz_lines_y[0]

    # Get vertical column dividers' absolute X coordinates
    vert_lines_x = sorted({
        round(line['start'][0], 2)
        for line in lines
        if abs(line['start'][0] - line['end'][0]) < axis_tolerance
    })

    boundaries = {
        "x_min": min_x, "x_max": max_x,
        "y_min": y_bottom, "y_max": y_top,
        "header_total_height": y_top - y_bottom,
        # Data rows start where the header's top line is.
        "data_start_y": y_top
    }

    # Return boundaries and the absolute X coords of vertical lines
    return boundaries, vert_lines_x

def find_table_boundaries_from_texts(texts, lines, y_cluster_tolerance=2.0, expansion_margin=5.0, header_cluster_gap_tolerance=5.0):
    """
    Finds table boundaries by identifying the densest group of adjacent text clusters (multi-line header),
    then finds the closest data row cluster (either above or below).

    Args:
        texts (list): TEXT entities as dicts with 'insert_point' ([x, y]) and 'height'.
            The list is not modified.
        lines (list): LINE entities as dicts with 'start' and 'end' ([x, y]).
        y_cluster_tolerance (float): Consecutive texts (sorted by Y) closer than this
            are placed in the same text row.
        expansion_margin (float): Margin added around the header text bounding box
            when collecting nearby lines. This argument used to be ignored in favour
            of a hard-coded 5.0; it is now honoured, and its default is 5.0 so that
            calls relying on the default behave as before.
        header_cluster_gap_tolerance (float): Adjacent text rows whose average Y
            differ by less than this are treated as part of the same header block.

    Returns:
        tuple: (bounds, sorted_col_x). bounds is a dict with the keys y_min, y_max,
            x_min, x_max, header_total_height and data_start_y (None when no text
            row exists outside the header). sorted_col_x is the sorted list of X
            coordinates (rounded to 2 decimals) of lines spanning the header height.
            (None, None) is returned when `texts` is empty, when no lines lie near
            the header, or when no line spans the full header height.
    """
    if not texts:
        return None, None

    def text_y(text):
        return text['insert_point'][1]

    # 1. Cluster texts by their Y-coordinate to find "rows" of text.
    # sorted() works on a copy so the caller's list keeps its original order.
    ordered_texts = sorted(texts, key=text_y)
    y_clusters = [[ordered_texts[0]]]
    for text in ordered_texts[1:]:
        if abs(text_y(text) - text_y(y_clusters[-1][-1])) < y_cluster_tolerance:
            y_clusters[-1].append(text)
        else:
            y_clusters.append([text])

    cluster_avg_y = [sum(text_y(t) for t in cluster) / len(cluster) for cluster in y_clusters]

    # 2. Find the densest *group* of adjacent clusters (our multi-line header).
    # The group is tracked as the index range [header_start, header_end).
    header_start, header_end, max_density = 0, 0, 0
    for start in range(len(y_clusters)):
        density = len(y_clusters[start])
        end = start + 1
        # Extend the group while the next cluster is close enough to the previous one.
        while (end < len(y_clusters) and
               abs(cluster_avg_y[end] - cluster_avg_y[end - 1]) < header_cluster_gap_tolerance):
            density += len(y_clusters[end])
            end += 1
        # Strict comparison: on a tie the lowest group found first is kept.
        if density > max_density:
            header_start, header_end, max_density = start, end, density

    # 3. All texts within the identified header group belong to the header.
    all_header_texts = [text for cluster in y_clusters[header_start:header_end] for text in cluster]
    header_min_y = min(text_y(t) for t in all_header_texts)
    header_max_y = max(text_y(t) for t in all_header_texts)

    # 4. Find the closest data row (can be above or below the header).
    first_data_row_cluster = None
    min_dist = float('inf')
    for idx, cluster in enumerate(y_clusters):
        if header_start <= idx < header_end:
            continue
        dist = min(abs(cluster_avg_y[idx] - header_min_y), abs(cluster_avg_y[idx] - header_max_y))
        if dist < min_dist:
            min_dist = dist
            first_data_row_cluster = cluster

    data_start_y = None
    if first_data_row_cluster:
        # Clusters are sorted by Y, so element 0 is the lowest text of the row.
        data_start_y = text_y(first_data_row_cluster[0])
    else:
        print("Warning: Could not automatically detect a data row near the header.")

    # 5. Define boundaries based on the multi-line header text block.
    min_x = min(t['insert_point'][0] for t in all_header_texts)
    max_x = max(t['insert_point'][0] for t in all_header_texts)
    max_y = max(text_y(t) + t['height'] for t in all_header_texts)

    bbox_min_x, bbox_max_x = min_x - expansion_margin, max_x + expansion_margin
    bbox_min_y, bbox_max_y = header_min_y - expansion_margin, max_y + expansion_margin

    # Lines whose both endpoints fall inside the expanded box's Y-span / X-span.
    table_h_lines = [l for l in lines if (bbox_min_y < l['start'][1] < bbox_max_y and
                                           bbox_min_y < l['end'][1] < bbox_max_y)]
    table_v_lines = [l for l in lines if (bbox_min_x < l['start'][0] < bbox_max_x and
                                           bbox_min_x < l['end'][0] < bbox_max_x)]

    if not table_h_lines or not table_v_lines:
        print("Warning: Could not find enough lines near the identified text header.")
        return None, None

    final_min_y = min(l['start'][1] for l in table_h_lines)
    final_max_y = max(l['start'][1] for l in table_h_lines)

    # Column dividers are the lines that span (within 1 unit) the full header height.
    sorted_col_x = sorted({
        round(line['start'][0], 2)
        for line in table_v_lines
        if min(line['start'][1], line['end'][1]) < final_min_y + 1
        and max(line['start'][1], line['end'][1]) > final_max_y - 1
    })

    if not sorted_col_x:
        return None, None

    bounds = {
        'y_min': final_min_y,
        'y_max': final_max_y,
        'x_min': sorted_col_x[0],
        'x_max': sorted_col_x[-1],
        'header_total_height': final_max_y - final_min_y,
        'data_start_y': data_start_y
    }

    return bounds, sorted_col_x

def generate_header_template(data, bounds, col_x_coords_abs):
    """
    Builds the header template dictionary from the extracted entity data.

    Texts whose insert point lies within the header's Y-span, and lines whose
    both endpoints lie within that span (with a 0.1 slack), are copied into the
    template with coordinates made relative to (bounds['x_min'], bounds['y_min'])
    and rounded to 2 decimals.

    Args:
        data (dict): Entity snapshot; its "lines" and "texts" lists are read.
        bounds (dict): Table boundaries with x_min, y_min, y_max and
            header_total_height.
        col_x_coords_abs (list): Absolute X coordinates of the column boundaries.

    Returns:
        dict: The header template, or None when `bounds` is empty/None.
    """
    source_lines = data.get("lines", [])
    source_texts = data.get("texts", [])
    if not bounds:
        print("Could not determine table boundaries for header. Aborting.")
        return None

    origin_x, origin_y = bounds['x_min'], bounds['y_min']
    header_low, header_high = bounds['y_min'], bounds['y_max']

    def to_relative(point):
        # Position relative to the table's lower-left corner, 2-decimal precision.
        return [round(point[0] - origin_x, 2), round(point[1] - origin_y, 2)]

    # --- Texts whose insert point falls inside the header's Y-span ---
    header_texts_data = [
        {
            "content": entry['content'],
            "relative_pos": to_relative(entry['insert_point']),
            "alignment": entry.get("alignment", "BOTTOM_LEFT"),
            "height": entry['height'],
            "style": entry['style'],
            "layer": entry['layer'],
            "color": entry['color']
        }
        for entry in source_texts
        if header_low <= entry['insert_point'][1] <= header_high
    ]
    # Highest text first; within one row, rightmost first.
    header_texts_data.sort(
        key=lambda item: (item['relative_pos'][1], item['relative_pos'][0]),
        reverse=True
    )

    # --- Lines with both endpoints roughly inside the header's Y-span ---
    slack_low = header_low - 0.1
    slack_high = header_high + 0.1
    header_lines_data = [
        {"start": to_relative(segment['start']), "end": to_relative(segment['end'])}
        for segment in source_lines
        if slack_low <= segment['start'][1] <= slack_high
        and slack_low <= segment['end'][1] <= slack_high
    ]

    # --- Assemble the template ---
    return {
        "template_name": "标准物料清单-底部表头",
        "row_height": 8.0,
        "header_height": round(bounds['header_total_height'], 2),
        "column_boundaries": [round(x - origin_x, 2) for x in col_x_coords_abs],
        "header_definition": {
            "lines": header_lines_data,
            "texts": header_texts_data
        },
        "column_definitions": {}
    }


def generate_column_definitions(data, bounds, col_x_coords_abs, header_template):
    """
    Analyzes the first data row to determine the text pattern for each column.

    Columns are named from the header template: the first header text of height
    3.5 in a column whose content does not contain 'PARTS' supplies the name.
    Columns without such a text are skipped.

    Args:
        data (dict): Entity snapshot; its "texts" list is read.
        bounds (dict): Table boundaries with x_min, x_max and data_start_y.
        col_x_coords_abs (list): Sorted absolute X coordinates of the column boundaries.
        header_template (dict): Header template; header_definition.texts is read,
            and row_height (default 8.0) sets the height of the data-row window.

    Returns:
        list: One dict per named column with the keys name, relative_x_start and
            text_definitions. Empty when bounds has no data_start_y.
    """
    texts = data.get("texts", [])
    table_base_x = bounds['x_min']

    # Use the header text to identify columns
    header_texts = header_template["header_definition"]["texts"]

    # Bucket header texts into the column whose [left, right) X-span contains them.
    header_texts_by_col = [[] for _ in col_x_coords_abs]
    column_spans = list(zip(col_x_coords_abs, col_x_coords_abs[1:]))
    for text in header_texts:
        text_x = text["relative_pos"][0] + table_base_x
        for i, (left_x, right_x) in enumerate(column_spans):
            if left_x <= text_x < right_x:
                header_texts_by_col[i].append(text)
                break

    # Get column names from header
    col_names = {}  # Maps col_idx -> col_name
    for i, col_texts in enumerate(header_texts_by_col):
        main_text = next((t for t in col_texts if t['height'] == 3.5 and 'PARTS' not in t['content']), None)
        if main_text:
            col_names[i] = main_text['content'].strip()

    # --- Find text patterns in the first data row ---
    first_data_row_y = bounds.get("data_start_y")

    if first_data_row_y is None:
        print("Warning: No data row was found in the source DXF. No column definitions will be generated.")
        return []

    row_height = header_template.get("row_height", 8.0)

    # The lower bound is inclusive: with text-based detection data_start_y IS the
    # Y of the lowest data-row text, which a strict '<' would drop from its own row.
    data_row_texts = [
        t for t in texts
        if first_data_row_y <= t['insert_point'][1] < first_data_row_y + row_height
    ]

    row_bottom_y = first_data_row_y

    col_defs_list = []
    for col_idx, col_name in col_names.items():
        col_left_x_abs = col_x_coords_abs[col_idx]
        col_right_x_abs = col_x_coords_abs[col_idx+1] if col_idx + 1 < len(col_x_coords_abs) else bounds['x_max']

        texts_in_col = [
            t for t in data_row_texts
            if col_left_x_abs <= t['insert_point'][0] < col_right_x_abs
        ]

        text_defs_for_col = []
        for i, text in enumerate(texts_in_col):
            key = "main"  # Default key
            # Only a multi-text cell distinguishes its parts, by text height.
            if len(texts_in_col) > 1:
                if text['height'] == 3.5:
                    key = "chinese_name"
                elif text['height'] == 2.0:
                    key = "english_name"
                elif text['height'] == 3.0 and i > 0:
                    key = "specification"

            text_defs_for_col.append({
                "data_key": key,
                # Offset from the column's left edge and the row's reference Y.
                "relative_pos": [
                    round(text['insert_point'][0] - col_left_x_abs, 2),
                    round(text['insert_point'][1] - row_bottom_y, 2)
                ],
                "alignment": text.get("alignment", "BOTTOM_LEFT"),
                "height": text['height'],
                "style": text['style'],
                "layer": text['layer'],
                "color": text['color']
            })

        col_defs_list.append({
            "name": col_name,
            "relative_x_start": round(col_left_x_abs - table_base_x, 2),
            "text_definitions": text_defs_for_col
        })

    return col_defs_list


def main():
    """
    Command-line entry point.

    Reads a DXF entities JSON snapshot, detects the table boundaries (text-based
    detection first, falling back to the line-based method), then writes the
    header template and, when any column definitions were produced, the columns
    template. Problems are reported on stdout and end the run early; nothing is
    raised for a missing, unreadable or malformed source file.
    """
    parser = argparse.ArgumentParser(description="Generate modular header and column templates from a DXF entities JSON file.")
    parser.add_argument("source_json", help="Path to the source JSON file (digital snapshot).")
    parser.add_argument("output_header_template", help="Path to write the output header_template.json.")
    parser.add_argument("output_columns_template", help="Path to write the output columns_template.json.")
    args = parser.parse_args()

    if not os.path.exists(args.source_json):
        print(f"Error: Source JSON file not found at {args.source_json}")
        return

    print(f"Reading entity data from {args.source_json}...")
    try:
        with open(args.source_json, 'r', encoding='utf-8') as f:
            entity_data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading source JSON file: {e}")
        return

    if not isinstance(entity_data, dict):
        print("Error: Source JSON must contain an object with 'texts' and 'lines' entries.")
        return

    print("Generating templates using text-based detection...")
    bounds, col_x_coords_abs = find_table_boundaries_from_texts(entity_data.get("texts", []), entity_data.get("lines", []))

    if not bounds or not col_x_coords_abs:
        print("Error: Could not determine table boundaries from the provided snapshot.")
        print("Attempting to fall back to the old line-based method...")
        bounds, col_x_coords_abs = find_table_boundaries(entity_data.get("lines", []))
        if not bounds or not col_x_coords_abs:
            print("Fallback method also failed. Aborting.")
            return

    # 1. Generate and save the header template
    header_template = generate_header_template(entity_data, bounds, col_x_coords_abs)
    if not header_template:
        # generate_header_template has already reported why; the column step
        # needs the header template, so stop here instead of dereferencing None.
        return

    try:
        with open(args.output_header_template, 'w', encoding='utf-8') as f:
            json.dump(header_template, f, ensure_ascii=False, indent=2)
        print(f"Successfully generated header template: {args.output_header_template}")
    except IOError as e:
        print(f"Error writing header template file: {e}")

    # 2. Generate and save the columns template
    column_definitions = generate_column_definitions(entity_data, bounds, col_x_coords_abs, header_template)

    if not column_definitions:
        print("No column definitions were generated; columns template was not written.")
        return

    columns_template = {
        "row_height": header_template.get("row_height", 8.0),  # Get row_height from header or default
        "column_definitions": column_definitions
    }

    try:
        with open(args.output_columns_template, 'w', encoding='utf-8') as f:
            json.dump(columns_template, f, ensure_ascii=False, indent=2)
        print(f"Successfully generated columns template: {args.output_columns_template}")
    except IOError as e:
        print(f"Error writing columns template file: {e}")

# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()