import argparse
import glob
import os

import cv2
import torch
from PIL import Image
from transformers import AutoProcessor, VisionEncoderDecoderModel

from utils.utils import *
class DOLPHIN:
    def __init__(self, model_id_or_path):
        """Initialize the Hugging Face model

        Args:
            model_id_or_path: Path to local model or Hugging Face model ID
        """
        # Load model from local path or Hugging Face hub
        self.processor = AutoProcessor.from_pretrained(model_id_or_path)
        self.model = VisionEncoderDecoderModel.from_pretrained(model_id_or_path)
        self.model.eval()

        # Set device and precision
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model = self.model.half()  # Always use half precision by default

        # Set tokenizer
        self.tokenizer = self.processor.tokenizer
    def chat(self, prompt, image):
        """Process an image or batch of images with the given prompt(s)

        Args:
            prompt: Text prompt or list of prompts to guide the model
            image: PIL Image or list of PIL Images to process

        Returns:
            Generated text or list of texts from the model
        """
        # Check if we're dealing with a batch
        is_batch = isinstance(image, list)

        if not is_batch:
            # Single image, wrap it in a list for consistent processing
            images = [image]
            prompts = [prompt]
        else:
            # Batch of images
            images = image
            prompts = prompt if isinstance(prompt, list) else [prompt] * len(images)
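        # NOTE: the preprocessing and generation steps were missing from this
        # snippet. The following is a sketch built on the standard
        # VisionEncoderDecoderModel/processor API; the "<s>{p} <Answer/>"
        # prompt template and the generation limits are assumptions.
        batch_inputs = self.processor(images, return_tensors="pt")
        pixel_values = batch_inputs.pixel_values.half().to(self.device)

        # Tokenize the prompts as decoder input (template assumed)
        prompts = [f"<s>{p} <Answer/>" for p in prompts]
        prompt_inputs = self.tokenizer(
            prompts, add_special_tokens=False, return_tensors="pt", padding=True
        )

        # Generate text autoregressively
        outputs = self.model.generate(
            pixel_values=pixel_values,
            decoder_input_ids=prompt_inputs.input_ids.to(self.device),
            decoder_attention_mask=prompt_inputs.attention_mask.to(self.device),
            max_length=4096,
            pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            use_cache=True,
            return_dict_in_generate=True,
        )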
        # Process output
        sequences = self.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=False)

        # Clean prompt text from output
        results = []
        for i, sequence in enumerate(sequences):
            cleaned = sequence.replace(prompts[i], "").replace("<pad>", "").replace("</s>", "").strip()
            results.append(cleaned)

        # Return a single result for single image input
        if not is_batch:
            return results[0]
        return results
def process_page(image_path, model, save_dir, max_batch_size=None):
    """Parse document images with two stages"""
    # Stage 1: Page-level layout and reading order parsing
    pil_image = Image.open(image_path).convert("RGB")
    layout_output = model.chat("Parse the reading order of this document.", pil_image)
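    # NOTE: the second stage was missing from this snippet. A sketch, assuming
    # a prepare_image() helper from utils.utils that returns the padded image
    # and its dimensions.
    # Stage 2: Element-level content parsing
    padded_image, dims = prepare_image(pil_image)
    recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size)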
    # Save outputs
    json_path = save_outputs(recognition_results, image_path, save_dir)

    return json_path, recognition_results
def process_elements(layout_results, padded_image, dims, model, max_batch_size=None):
    """Parse all document elements with parallel decoding"""
    layout_results = parse_layout_string(layout_results)

    # Store text and table elements separately
    text_elements = []  # Text elements
    table_elements = []  # Table elements
    figure_results = []  # Image elements (no processing needed)
    previous_box = None
    reading_order = 0
    # Collect elements to process and group by type
    for bbox, label in layout_results:
        try:
            # Adjust coordinates
            x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, previous_box = process_coordinates(
                bbox, padded_image, dims, previous_box
            )

            # Crop and parse element
            cropped = padded_image[y1:y2, x1:x2]
            if cropped.size > 0:
                if label == "fig":
                    # For figure regions, add empty text result immediately
                    figure_results.append(
                        {
                            "label": label,
                            "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
                            "text": "",
                            "reading_order": reading_order,
                        }
                    )
                else:
                    # Prepare element for parsing
                    pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
                    element_info = {
                        "crop": pil_crop,
                        "label": label,
                        "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
                        "reading_order": reading_order,
                    }

                    # Group by type
                    if label == "tab":
                        table_elements.append(element_info)
                    else:  # Text elements
                        text_elements.append(element_info)

            reading_order += 1

        except Exception as e:
            print(f"Error processing bbox with label {label}: {str(e)}")
            continue
    # Initialize results list
    recognition_results = figure_results.copy()

    # Process text elements (in batches)
    if text_elements:
        text_results = process_element_batch(text_elements, model, "Read text in the image.", max_batch_size)
        recognition_results.extend(text_results)

    # Process table elements (in batches)
    if table_elements:
        table_results = process_element_batch(table_elements, model, "Parse the table in the image.", max_batch_size)
        recognition_results.extend(table_results)

    # Sort elements by reading order
    recognition_results.sort(key=lambda x: x.get("reading_order", 0))

    return recognition_results
def process_element_batch(elements, model, prompt, max_batch_size=None):
    """Process elements of the same type in batches"""
    results = []

    # Determine batch size
    batch_size = len(elements)
    if max_batch_size is not None and max_batch_size > 0:
        batch_size = min(batch_size, max_batch_size)

    # Process in batches
    for i in range(0, len(elements), batch_size):
        batch_elements = elements[i : i + batch_size]
        crops_list = [elem["crop"] for elem in batch_elements]

        # Use the same prompt for all elements in the batch
        prompts_list = [prompt] * len(crops_list)
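        # NOTE: the batched inference call and result assembly were missing
        # from this snippet; a sketch that reuses DOLPHIN.chat's batch mode.
        batch_results = model.chat(prompts_list, crops_list)

        # Attach the recognized text to each element's metadata
        for j, result in enumerate(batch_results):
            elem = batch_elements[j]
            results.append(
                {
                    "label": elem["label"],
                    "bbox": elem["bbox"],
                    "text": result.strip(),
                    "reading_order": elem["reading_order"],
                }
            )

    return results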
def main():
    parser = argparse.ArgumentParser(description="Document processing tool using DOLPHIN model")
    parser.add_argument("--model_path", default="./hf_model", help="Path to Hugging Face model")
    parser.add_argument("--input_path", type=str, default="./demo", help="Path to input image or directory of images")
    parser.add_argument(
        "--save_dir",
        type=str,
        default=None,
        help="Directory to save parsing results (default: same as input directory)",
    )
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=16,
        help="Maximum number of document elements to parse in a single batch (default: 16)",
    )
    args = parser.parse_args()
    # Set save directory
    save_dir = args.save_dir or (
        args.input_path if os.path.isdir(args.input_path) else os.path.dirname(args.input_path)
    )
    setup_output_dirs(save_dir)
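    # NOTE: model loading and input collection were missing from this snippet.
    # A sketch; the glob patterns are assumptions.
    # Load Model
    model = DOLPHIN(args.model_path)

    # Collect input images
    if os.path.isdir(args.input_path):
        image_files = sorted(
            glob.glob(os.path.join(args.input_path, "*.jpg"))
            + glob.glob(os.path.join(args.input_path, "*.jpeg"))
            + glob.glob(os.path.join(args.input_path, "*.png"))
        )
    else:
        image_files = [args.input_path]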
    total_samples = len(image_files)
    print(f"\nTotal samples to process: {total_samples}")
    # Process All Document Images
    for image_path in image_files:
        print(f"\nProcessing {image_path}")
        try:
            json_path, recognition_results = process_page(
                image_path=image_path,
                model=model,
                save_dir=save_dir,
                max_batch_size=args.max_batch_size,
            )

            print(f"Processing completed. Results saved to {save_dir}")

        except Exception as e:
            print(f"Error processing {image_path}: {str(e)}")
            continue
if __name__ == "__main__":
    main()
Element-level parsing
import argparse
import glob
import os

import torch
from PIL import Image
from transformers import AutoProcessor, VisionEncoderDecoderModel

from utils.utils import *
class DOLPHIN:
    def __init__(self, model_id_or_path):
        """Initialize the Hugging Face model

        Args:
            model_id_or_path: Path to local model or Hugging Face model ID
        """
        # Load model from local path or Hugging Face hub
        self.processor = AutoProcessor.from_pretrained(model_id_or_path)
        self.model = VisionEncoderDecoderModel.from_pretrained(model_id_or_path)
        self.model.eval()

        # Set device and precision
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model = self.model.half()  # Always use half precision by default

        # Set tokenizer
        self.tokenizer = self.processor.tokenizer
    def chat(self, prompt, image):
        """Process an image with the given prompt

        Args:
            prompt: Text prompt to guide the model
            image: PIL Image to process

        Returns:
            Generated text from the model
        """
        # Prepare image
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        pixel_values = pixel_values.half()
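        # NOTE: the prompt preparation and generation steps were missing from
        # this snippet; a sketch mirroring the page-level script, with the
        # "<s>{prompt} <Answer/>" template and generation limits assumed.
        prompt = f"<s>{prompt} <Answer/>"
        prompt_inputs = self.tokenizer(prompt, add_special_tokens=False, return_tensors="pt")

        # Generate text autoregressively
        outputs = self.model.generate(
            pixel_values=pixel_values.to(self.device),
            decoder_input_ids=prompt_inputs.input_ids.to(self.device),
            max_length=4096,
            pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            use_cache=True,
            return_dict_in_generate=True,
        )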
        # Process the output
        sequence = self.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=False)[0]
        sequence = sequence.replace(prompt, "").replace("<pad>", "").replace("</s>", "").strip()

        return sequence
def process_element(image_path, model, element_type, save_dir=None):
    """Process a single element image (text, table, formula)

    Args:
        image_path: Path to the element image
        model: DOLPHIN model instance
        element_type: Type of element ('text', 'table', 'formula')
        save_dir: Directory to save results (default: same as input directory)

    Returns:
        Parsed content of the element and recognition results
    """
    # Load and prepare image
    pil_image = Image.open(image_path).convert("RGB")
    pil_image = crop_margin(pil_image)
    # Select appropriate prompt based on element type
    if element_type == "table":
        prompt = "Parse the table in the image."
        label = "tab"
    elif element_type == "formula":
        prompt = "Read text in the image."
        label = "formula"
    else:  # Default to text
        prompt = "Read text in the image."
        label = "text"
    # Process the element
    result = model.chat(prompt, pil_image)

    # Create recognition result in the same format as the document parser
    recognition_result = [
        {
            "label": label,
            "text": result.strip(),
        }
    ]

    # Save results if save_dir is provided
    if save_dir:
        save_outputs(recognition_result, image_path, save_dir)
        print(f"Results saved to {save_dir}")

    return result, recognition_result
def main():
    parser = argparse.ArgumentParser(description="Element-level processing using DOLPHIN model")
    parser.add_argument("--model_path", default="./hf_model", help="Path to Hugging Face model")
    parser.add_argument("--input_path", type=str, required=True, help="Path to input image or directory of images")
    parser.add_argument(
        "--element_type",
        type=str,
        choices=["text", "table", "formula"],
        default="text",
        help="Type of element to process (text, table, formula)",
    )
    parser.add_argument(
        "--save_dir",
        type=str,
        default=None,
        help="Directory to save parsing results (default: same as input directory)",
    )
    parser.add_argument("--print_results", action="store_true", help="Print recognition results to console")
    args = parser.parse_args()
    # Load Model
    model = DOLPHIN(args.model_path)
    # Set save directory
    save_dir = args.save_dir or (
        args.input_path if os.path.isdir(args.input_path) else os.path.dirname(args.input_path)
    )
    setup_output_dirs(save_dir)
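    # NOTE: input collection was missing from this snippet. A sketch using the
    # same (assumed) glob patterns as the page-level script.
    if os.path.isdir(args.input_path):
        image_files = sorted(
            glob.glob(os.path.join(args.input_path, "*.jpg"))
            + glob.glob(os.path.join(args.input_path, "*.jpeg"))
            + glob.glob(os.path.join(args.input_path, "*.png"))
        )
    else:
        image_files = [args.input_path]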
    total_samples = len(image_files)
    print(f"\nTotal samples to process: {total_samples}")
    # Process images one by one
    for image_path in image_files:
        print(f"\nProcessing {image_path}")
        try:
            result, recognition_result = process_element(
                image_path=image_path,
                model=model,
                element_type=args.element_type,
                save_dir=save_dir,
            )
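            # NOTE: the snippet is truncated here; a minimal, assumed
            # completion so the loop and entry point are valid.
            if args.print_results:
                print("\nRecognition result:")
                print(result)

        except Exception as e:
            print(f"Error processing {image_path}: {str(e)}")
            continue


if __name__ == "__main__":
    main()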