import docx2txt
import pytesseract
from pdf2image import convert_from_path
from PyPDF2 import PdfReader

def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF, DOCX, or TXT file.

    The file type is selected from the extension of ``file_path``, compared
    case-insensitively (``report.PDF`` is handled the same as ``report.pdf``).

    For PDFs the embedded text of every page is read first. If that yields
    nothing but whitespace, each page is rendered to an image and passed
    through OCR, and the OCR output is appended.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text extracted from the file.

    Raises:
        RuntimeError: If the extension is not ``.pdf``, ``.docx`` or ``.txt``,
            or if PDF or DOCX extraction fails. In the failure case the
            underlying exception is attached as ``__cause__``.
        OSError, UnicodeDecodeError: Errors from opening or decoding a TXT
            file are not wrapped and propagate unchanged.
    """
    # Only the comparison is case-folded; the file is opened with the
    # caller's original path so case-sensitive filesystems still work.
    lowered_path = file_path.lower()

    if lowered_path.endswith(".pdf"):
        try:
            reader = PdfReader(file_path)
            # extract_text() result is guarded with `or ""` so a page that
            # yields nothing does not break the join.
            text = "".join(page.extract_text() or "" for page in reader.pages)
            # OCR fallback if no text found
            if not text.strip():
                text += "".join(
                    pytesseract.image_to_string(page_image)
                    for page_image in convert_from_path(file_path)
                )
        except Exception as e:
            # Chain explicitly so the root cause stays visible to callers.
            raise RuntimeError(f"PDF extraction failed: {e}") from e
        return text

    if lowered_path.endswith(".docx"):
        try:
            return docx2txt.process(file_path)
        except Exception as e:
            raise RuntimeError(f"DOCX extraction failed: {e}") from e

    if lowered_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    raise RuntimeError("Unsupported file type")
