"""Enhanced Porter-like stemmer — optimized for technical/ML vocabulary.

Minimal-overstemming design: preserves technical compound terms.
"""

import re
from typing import List


def enhanced_porter_stem(word: str) -> str:
    """Safe Porter stemmer for technical docs."""
    word = word.lower()
    if len(word) <= 3:
        return word

    # Step 1a: Plurals
    if word.endswith("sses"):
        return word[:-2]
    if word.endswith("ies") and len(word) > 4:
        return word[:-3] + "y"
    if word.endswith("es") and len(word) > 4:
        if word[-3] in "sxzh" or (len(word) > 5 and word[-4:-2] == "ch"):
            return word[:-2]
        return word[:-1]
    if word.endswith("ss"):
        return word
    if word.endswith("s") and not word.endswith(("ss", "us", "is")):
        return word[:-1]

    # Step 1b: Verb forms
    if word.endswith("ing") and len(word) > 5:
        return word[:-3]
    if word.endswith("ed") and len(word) > 4:
        return word[:-2]

    # Step 2: Common suffixes (longest-first)
    suffixes = [
        ("ational", "ate"), ("tional", "tion"), ("enci", "ence"),
        ("anci", "ance"), ("izer", "ize"), ("alli", "al"),
        ("entli", "ent"), ("eli", "e"), ("ousli", "ous"),
        ("ization", "ize"), ("ation", "ate"), ("ator", "ate"),
        ("alism", "al"), ("iveness", "ive"), ("fulness", "ful"),
        ("ousness", "ous"), ("ness", ""), ("ment", ""),
        ("ful", ""), ("less", ""), ("ive", ""), ("ous", ""),
        ("ant", ""), ("ent", ""), ("ism", ""), ("ate", ""),
        ("iti", ""), ("al", ""), ("er", ""), ("ic", ""),
        ("able", ""), ("ible", ""), ("ly", "")
    ]

    for suffix, replacement in suffixes:
        if word.endswith(suffix) and len(word) > len(suffix) + 2:
            return word[:-len(suffix)] + replacement

    return word


def tokenize_and_stem(text: str) -> List[str]:
    """Extract words >= 3 chars, lowercase, and stem."""
    words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    return [enhanced_porter_stem(w) for w in words]
