import json import os import argparse def find_table_boundaries(lines): """ Finds the overall boundaries of the table structure from LINE entities. It assumes the lowest lines form the header. Returns: dict: A dictionary with min/max coordinates for X and Y. """ if not lines: return None, None x_coords = [] y_coords = [] for line in lines: x_coords.extend([line['start'][0], line['end'][0]]) y_coords.extend([line['start'][1], line['end'][1]]) min_x, max_x = min(x_coords), max(x_coords) # The header is at the bottom, so find the lowest Y coordinates for horizontal lines horiz_lines_y = sorted(list(set( line['start'][1] for line in lines if abs(line['start'][1] - line['end'][1]) < 0.1 ))) # Assume the header is composed of the bottom two sections if len(horiz_lines_y) < 3: print("Warning: Could not clearly identify a 2-row header structure.") # Fallback for a single-row header if len(horiz_lines_y) < 2: return None, None y_bottom = horiz_lines_y[0] y_top = horiz_lines_y[1] data_start_y = y_top # Data rows start where the header ends else: y_bottom = horiz_lines_y[0] y_middle = horiz_lines_y[1] y_top = horiz_lines_y[2] data_start_y = y_top # Data rows start above the header top line # Get vertical column dividers' absolute X coordinates vert_lines_x = sorted(list(set( round(line['start'][0], 2) for line in lines if abs(line['start'][0] - line['end'][0]) < 0.1 ))) boundaries = { "x_min": min_x, "x_max": max_x, "y_min": y_bottom, "y_max": y_top, "header_total_height": y_top - y_bottom, "data_start_y": data_start_y } # Return boundaries and the absolute X coords of vertical lines return boundaries, vert_lines_x def find_table_boundaries_from_texts(texts, lines, y_cluster_tolerance=2.0, expansion_margin=2.0, header_cluster_gap_tolerance=5.0): """ Finds table boundaries by identifying the densest group of adjacent text clusters (multi-line header), then finds the closest data row cluster (either above or below). """ if not texts: return None, None # 1. Cluster texts by their Y-coordinate to find "rows" of text. texts.sort(key=lambda t: t['insert_point'][1]) y_clusters = [] if texts: current_cluster = [texts[0]] for i in range(1, len(texts)): if abs(texts[i]['insert_point'][1] - current_cluster[-1]['insert_point'][1]) < y_cluster_tolerance: current_cluster.append(texts[i]) else: y_clusters.append(current_cluster) current_cluster = [texts[i]] y_clusters.append(current_cluster) if not y_clusters: return None, None # 2. Find the densest *group* of adjacent clusters (our multi-line header). best_header_group = [] max_density = 0 for i in range(len(y_clusters)): current_group = [y_clusters[i]] current_density = len(y_clusters[i]) # Look ahead to see if the next clusters are close enough to be part of the same header for j in range(i + 1, len(y_clusters)): # Calculate vertical gap between the last cluster in the group and the next one last_cluster_avg_y = sum(t['insert_point'][1] for t in current_group[-1]) / len(current_group[-1]) next_cluster_avg_y = sum(t['insert_point'][1] for t in y_clusters[j]) / len(y_clusters[j]) if abs(next_cluster_avg_y - last_cluster_avg_y) < header_cluster_gap_tolerance: current_group.append(y_clusters[j]) current_density += len(y_clusters[j]) else: break # The gap is too large, this block has ended if current_density > max_density: max_density = current_density best_header_group = current_group if not best_header_group: print("Warning: Could not identify a header group.") return None, None # 3. All texts within the identified header group belong to the header. all_header_texts = [text for cluster in best_header_group for text in cluster] # 4. Find the closest data row (can be above or below the header). header_indices = {y_clusters.index(cluster) for cluster in best_header_group} first_data_row_cluster = None min_dist = float('inf') for i, cluster in enumerate(y_clusters): if i not in header_indices: # It's a data row candidate. Find its distance to the header block. header_min_y = min(t['insert_point'][1] for t in all_header_texts) header_max_y = max(t['insert_point'][1] for t in all_header_texts) cluster_avg_y = sum(t['insert_point'][1] for t in cluster) / len(cluster) dist = min(abs(cluster_avg_y - header_min_y), abs(cluster_avg_y - header_max_y)) if dist < min_dist: min_dist = dist first_data_row_cluster = cluster data_start_y = None if first_data_row_cluster: data_start_y = first_data_row_cluster[0]['insert_point'][1] else: print("Warning: Could not automatically detect a data row near the header.") # 5. Define boundaries based on the multi-line header text block. min_x = min(t['insert_point'][0] for t in all_header_texts) max_x = max(t['insert_point'][0] for t in all_header_texts) min_y = min(t['insert_point'][1] for t in all_header_texts) max_y = max(t['insert_point'][1] + t['height'] for t in all_header_texts) # ... (The rest of the logic to find lines and define final bounds remains largely the same, # but it will now operate on the correct header_texts and boundaries) # Re-using the line-finding logic from the previous implementation expansion_margin = 5.0 # Increase margin slightly for complex layouts bbox_min_x, bbox_max_x = min_x - expansion_margin, max_x + expansion_margin bbox_min_y, bbox_max_y = min_y - expansion_margin, max_y + expansion_margin table_h_lines = [l for l in lines if (bbox_min_y < l['start'][1] < bbox_max_y and bbox_min_y < l['end'][1] < bbox_max_y)] table_v_lines = [l for l in lines if (bbox_min_x < l['start'][0] < bbox_max_x and bbox_min_x < l['end'][0] < bbox_max_x)] if not table_h_lines or not table_v_lines: print("Warning: Could not find enough lines near the identified text header.") return None, None final_min_y = min(l['start'][1] for l in table_h_lines) final_max_y = max(l['start'][1] for l in table_h_lines) col_x_coords = set() for line in table_v_lines: if min(line['start'][1], line['end'][1]) < final_min_y + 1 and \ max(line['start'][1], line['end'][1]) > final_max_y - 1: col_x_coords.add(round(line['start'][0], 2)) sorted_col_x = sorted(list(col_x_coords)) if not sorted_col_x: return None, None bounds = { 'y_min': final_min_y, 'y_max': final_max_y, 'x_min': sorted_col_x[0], 'x_max': sorted_col_x[-1], 'header_total_height': final_max_y - final_min_y, 'data_start_y': data_start_y } return bounds, sorted_col_x def generate_header_template(data, bounds, col_x_coords_abs): """ Generates the header part of the template from extracted entity data, including the exact line geometry. """ lines = data.get("lines", []) texts = data.get("texts", []) if not bounds: print("Could not determine table boundaries for header. Aborting.") return None table_base_x = bounds['x_min'] table_base_y = bounds['y_min'] # --- Identify texts that are within the header boundaries --- header_texts_data = [] for text in texts: text_y = text['insert_point'][1] if bounds['y_min'] <= text_y <= bounds['y_max']: rel_x = text['insert_point'][0] - table_base_x rel_y = text_y - table_base_y header_texts_data.append({ "content": text['content'], "relative_pos": [round(rel_x, 2), round(rel_y, 2)], "alignment": text.get("alignment", "BOTTOM_LEFT"), "height": text['height'], "style": text['style'], "layer": text['layer'], "color": text['color'] }) # --- Identify LINES that are within the header boundaries --- header_lines_data = [] for line in lines: start_y = line['start'][1] end_y = line['end'][1] # Check if the line is roughly within the header's Y-span if bounds['y_min'] - 0.1 <= start_y <= bounds['y_max'] + 0.1 and \ bounds['y_min'] - 0.1 <= end_y <= bounds['y_max'] + 0.1: start_rel_x = line['start'][0] - table_base_x start_rel_y = start_y - table_base_y end_rel_x = line['end'][0] - table_base_x end_rel_y = end_y - table_base_y header_lines_data.append({ "start": [round(start_rel_x, 2), round(start_rel_y, 2)], "end": [round(end_rel_x, 2), round(end_rel_y, 2)] }) # --- Build the final template structure --- col_boundaries_relative = [round(x - table_base_x, 2) for x in col_x_coords_abs] template = { "template_name": "标准物料清单-底部表头", "row_height": 8.0, "header_height": round(bounds['header_total_height'], 2), "column_boundaries": col_boundaries_relative, "header_definition": { "lines": header_lines_data, "texts": sorted(header_texts_data, key=lambda x: (x['relative_pos'][1], x['relative_pos'][0]), reverse=True) }, "column_definitions": {} } return template def generate_column_definitions(data, bounds, col_x_coords_abs, header_template): """ Analyzes the data rows to determine the pattern for each column. """ texts = data.get("texts", []) table_base_x = bounds['x_min'] # Use the header text to identify columns header_texts = header_template["header_definition"]["texts"] # Find one distinct piece of text per column from the top row of the header to name the column col_names = {} # Maps col_idx -> col_name header_texts_by_col = [[] for _ in col_x_coords_abs] for text in header_texts: text_x = text["relative_pos"][0] + table_base_x for i in range(len(col_x_coords_abs) - 1): if col_x_coords_abs[i] <= text_x < col_x_coords_abs[i+1]: header_texts_by_col[i].append(text) break # Get column names from header for i, col_texts in enumerate(header_texts_by_col): main_text = next((t for t in col_texts if t['height'] == 3.5 and 'PARTS' not in t['content']), None) if main_text: col_names[i] = main_text['content'].strip() # --- Find text patterns in the first data row --- first_data_row_y = bounds.get("data_start_y") if first_data_row_y is None: print("Warning: No data row was found in the source DXF. No column definitions will be generated.") return [] data_row_texts = [ t for t in texts if first_data_row_y < t['insert_point'][1] < first_data_row_y + 8.0 ] col_defs_list = [] for col_idx, col_name in col_names.items(): col_left_x_abs = col_x_coords_abs[col_idx] col_right_x_abs = col_x_coords_abs[col_idx+1] if col_idx + 1 < len(col_x_coords_abs) else bounds['x_max'] texts_in_col = [ t for t in data_row_texts if col_left_x_abs <= t['insert_point'][0] < col_right_x_abs ] text_defs_for_col = [] for i, text in enumerate(texts_in_col): key = "main" # Default key if len(texts_in_col) > 1: if text['height'] == 3.5: key = "chinese_name" elif text['height'] == 2.0: key = "english_name" elif text['height'] == 3.0 and i > 0: key = "specification" row_bottom_y = bounds["data_start_y"] text_defs_for_col.append({ "data_key": key, "relative_pos": [ round(text['insert_point'][0] - col_left_x_abs, 2), round(text['insert_point'][1] - row_bottom_y, 2) ], "alignment": text.get("alignment", "BOTTOM_LEFT"), "height": text['height'], "style": text['style'], "layer": text['layer'], "color": text['color'] }) col_defs_list.append({ "name": col_name, "relative_x_start": round(col_left_x_abs - table_base_x, 2), "text_definitions": text_defs_for_col }) return col_defs_list def main(): parser = argparse.ArgumentParser(description="Generate modular header and column templates from a DXF entities JSON file.") parser.add_argument("source_json", help="Path to the source JSON file (digital snapshot).") parser.add_argument("output_header_template", help="Path to write the output header_template.json.") parser.add_argument("output_columns_template", help="Path to write the output columns_template.json.") args = parser.parse_args() if not os.path.exists(args.source_json): print(f"Error: Source JSON file not found at {args.source_json}") return print(f"Reading entity data from {args.source_json}...") with open(args.source_json, 'r', encoding='utf-8') as f: entity_data = json.load(f) print("Generating templates using text-based detection...") # USE THE NEW, ROBUST FUNCTION bounds, col_x_coords_abs = find_table_boundaries_from_texts(entity_data.get("texts", []), entity_data.get("lines", [])) if not bounds or not col_x_coords_abs: print("Error: Could not determine table boundaries from the provided snapshot.") print("Attempting to fall back to the old line-based method...") bounds, col_x_coords_abs = find_table_boundaries(entity_data.get("lines", [])) if not bounds or not col_x_coords_abs: print("Fallback method also failed. Aborting.") return # 1. Generate and save the header template header_template = generate_header_template(entity_data, bounds, col_x_coords_abs) if header_template: try: with open(args.output_header_template, 'w', encoding='utf-8') as f: json.dump(header_template, f, ensure_ascii=False, indent=2) print(f"Successfully generated header template: {args.output_header_template}") except IOError as e: print(f"Error writing header template file: {e}") # 2. Generate and save the columns template column_definitions = generate_column_definitions(entity_data, bounds, col_x_coords_abs, header_template) columns_template = { "row_height": header_template.get("row_height", 8.0), # Get row_height from header or default "column_definitions": column_definitions } if column_definitions: try: with open(args.output_columns_template, 'w', encoding='utf-8') as f: json.dump(columns_template, f, ensure_ascii=False, indent=2) print(f"Successfully generated columns template: {args.output_columns_template}") except IOError as e: print(f"Error writing columns template file: {e}") if __name__ == "__main__": main()