#!/usr/bin/env python3
import argparse
import base64
import json
import os
import re
import ssl
import time
import urllib.error
import urllib.request
import urllib.parse
from pathlib import Path
from datetime import datetime

import pandas as pd

# --- Configuration ---
SCRIPT_DIR = Path(__file__).resolve().parent
API_URL_DEFAULT = "https://ai-generalizer.zeitraum.dev/api/generalize"
API_USER_DEFAULT = "dashboard"
API_PASSWORD_DEFAULT = "zeit:raum-generalizer-ai"
INPUT_FILE_DEFAULT = str(SCRIPT_DIR / "all_jobtitles.xlsx")
OUTPUT_FILE_DEFAULT = str(SCRIPT_DIR / "generalized_jobtitles.xlsx")
JOBS_FILE_DEFAULT = str(SCRIPT_DIR / "generalizer_jobs.json")
BATCH_SIZE = 500  # Number of titles per async batch
POLL_INTERVAL = 300  # 5 minutes in seconds

# --- Regex for cleaning ---
PORTAL_HINT_RE = re.compile(r"\bjobs?\b|\.jobs\b|\bjob\b", re.IGNORECASE)
DOMAIN_HINT_RE = re.compile(r"\.(jobs|de|com|net|org)\b", re.IGNORECASE)

def looks_like_portal(value: str) -> bool:
    if not value:
        return False
    text = value.strip()
    if not text:
        return False
    if PORTAL_HINT_RE.search(text):
        return True
    if DOMAIN_HINT_RE.search(text):
        return True
    return False

def strip_portal(title: str):
    if title is None:
        return None, None
    text = str(title).strip()
    if not text:
        return text, None

    dash_match = re.search(r"\s+-\s+(.+?)\s*$", text)
    if dash_match and looks_like_portal(dash_match.group(1)):
        return text[: dash_match.start()].rstrip(), dash_match.group(1).strip()

    space_match = re.search(r"\s{2,}(.+?)\s*$", text)
    if space_match and looks_like_portal(space_match.group(1)):
        return text[: space_match.start()].rstrip(), space_match.group(1).strip()

    return text, None

def parse_generalized_list(generalized):
    if not generalized:
        return []
    # If it's already a list (from JSON), return it
    if isinstance(generalized, list):
        return generalized
    # If it's a string representation of a list or comma-separated
    s = str(generalized).strip()
    if s.startswith('[') and s.endswith(']'):
        try:
            return json.loads(s)
        except:
            pass
    return [part.strip() for part in s.split(",") if part.strip()]

# --- API Client ---

def make_request(url, method="GET", data=None, username=None, password=None, insecure=False, timeout=60):
    if data:
        payload = json.dumps(data).encode("utf-8")
    else:
        payload = None

    headers = {
        "Content-Type": "application/json",
        "User-Agent": "ZRM-Generalizer-Script/2.0"
    }

    if username and password:
        auth = base64.b64encode(f"{username}:{password}".encode("utf-8")).decode("utf-8")
        headers["Authorization"] = f"Basic {auth}"

    context = ssl._create_unverified_context() if insecure else None

    req = urllib.request.Request(url, data=payload, headers=headers, method=method)
    
    try:
        with urllib.request.urlopen(req, timeout=timeout, context=context) as response:
            return json.loads(response.read().decode("utf-8"))
    except urllib.error.URLError as e:
        if isinstance(e.reason, ssl.SSLError) and "CERTIFICATE_VERIFY_FAILED" in str(e.reason) and not insecure:
            print(f"SSL Verification failed. Retrying with insecure mode automatically...")
            return make_request(url, method, data, username, password, insecure=True, timeout=timeout)
        
        if hasattr(e, 'read'):
             body = e.read().decode("utf-8")
             print(f"HTTP Error {e.code} for {url}: {body}")
        else:
             print(f"Network/SSL Error for {url}: {e}")
        raise

# --- Core Logic ---

def load_jobs_state(path):
    if os.path.exists(path):
        try:
            return json.loads(Path(path).read_text(encoding="utf-8"))
        except:
            print(f"Warning: Could not read jobs file {path}, starting fresh.")
    return {"jobs": []}

def save_jobs_state(path, state):
    Path(path).write_text(json.dumps(state, indent=2, ensure_ascii=False), encoding="utf-8")

def submit_batches(args):
    print(f"Reading input file: {args.input}")
    try:
        frame = pd.read_excel(args.input, header=None, dtype=str)
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return

    raw_titles = frame.stack().dropna().astype(str).tolist()
    if args.limit:
        raw_titles = raw_titles[:args.limit]
    
    print(f"Found {len(raw_titles)} titles. preparing to clean and dedup...")

    # Clean and dedup
    unique_cleaned = {}
    for raw in raw_titles:
        cleaned, _ = strip_portal(raw)
        if cleaned:
            unique_cleaned[cleaned] = True
    
    titles_to_send = list(unique_cleaned.keys())
    print(f"Unique cleaned titles to generalize: {len(titles_to_send)}")

    state = load_jobs_state(args.jobs_file)
    
    # Check if we already have jobs?
    # For now, we assume if called, we want to submit new ones. 
    # But maybe we should check if these titles are already covered? 
    # To keep it simple as requested: "send all titles ... to the batch converter"
    
    batches = [titles_to_send[i:i + BATCH_SIZE] for i in range(0, len(titles_to_send), BATCH_SIZE)]
    print(f"Splitting into {len(batches)} batches of {BATCH_SIZE}...")

    total_batches = len(batches)
    for i, batch in enumerate(batches):
        print(f"Submitting batch {i+1}/{total_batches} ({len(batch)} titles)...")
        
        try:
            payload = {
                "titles": [{"title": t} for t in batch],
                "mode": "async"
            }
            
            response = make_request(
                args.api_url, 
                method="POST", 
                data=payload,
                username=args.api_user,
                password=args.api_password,
                insecure=args.insecure
            )
            
            job_id = response.get("job_id")
            status_url = response.get("status_url")
            
            if job_id:
                # Construct absolute status URL if relative
                if status_url and not status_url.startswith('http'):
                    parsed = urllib.parse.urlparse(args.api_url)
                    base = f"{parsed.scheme}://{parsed.netloc}"
                    status_url = urllib.parse.urljoin(base, status_url)

                state["jobs"].append({
                    "job_id": job_id,
                    "status_url": status_url,
                    "status": "queued",
                    "created_at": datetime.now().isoformat(),
                    "batch_index": i,
                    "count": len(batch)
                })
                save_jobs_state(args.jobs_file, state)
            else:
                print(f"Error: No job_id returned for batch {i+1}")
                
            time.sleep(1) # Gentle rate limiting
            
        except Exception as e:
            print(f"Failed to submit batch {i+1}: {e}")
            # Continue or break? Let's break to avoid partial mess if network down
            break
            
    print("Submission complete.")

def process_results_to_excel(args, state):
    """
    Reads the original input, applies all results found in state/jobs, and writes output.
    """
    print("Generating output Excel file...")
    
    # 1. Aggregate all results into a lookup map
    # Map: cleaned_title -> { generalized_title: "...", titles: [...] }
    results_map = {}
    
    for job in state.get("jobs", []):
        results = job.get("results", [])
        if not results:
            continue
            
        for item in results:
            # The API returns 'original' which matches our sent 'title' (cleaned)
            original = item.get("original")
            generalized = item.get("generalized")
            
            if original:
                results_map[original] = generalized

    # 2. Process the Input Excel row by row
    try:
        frame = pd.read_excel(args.input, header=None, dtype=str)
    except Exception as e:
        print(f"Error reading input Excel: {e}")
        return

    raw_titles = frame.stack().dropna().astype(str).tolist()
    if args.limit:
        raw_titles = raw_titles[:args.limit]

    output_rows = []
    
    for raw_title in raw_titles:
        cleaned, portal = strip_portal(raw_title)
        
        generalized_title = None
        related_titles = []
        
        if cleaned and cleaned in results_map:
            generalized_val = results_map[cleaned]
            # generalized_val might be a string or complex object depending on API version?
            # Based on old script, it's a string list or string
            related_titles = parse_generalized_list(generalized_val)
            if related_titles:
                generalized_title = related_titles[0]
        
        row = {
            "original_title": raw_title,
            "cleaned_title": cleaned,
            "portal_removed": portal,
            "generalized_title": generalized_title,
        }
        
        # Add title_1 ... title_10
        for idx in range(1, 11):
            val = related_titles[idx-1] if (idx-1) < len(related_titles) else None
            row[f"title_{idx}"] = val
            
        output_rows.append(row)

    # 3. Write to Excel
    df = pd.DataFrame(output_rows)
    df.to_excel(args.output, index=False)
    print(f"Successfully wrote {len(output_rows)} rows to {args.output}")

def poll_loop(args):
    print(f"Starting polling loop (Interval: {args.poll_interval}s)...")
    
    while True:
        state = load_jobs_state(args.jobs_file)
        pending_jobs = [j for j in state["jobs"] if j["status"] not in ["completed", "failed"]]
        
        if not pending_jobs:
            print("All jobs completed!")
            process_results_to_excel(args, state)
            break
            
        print(f"Checking status for {len(pending_jobs)} pending jobs...")
        any_changed = False
        
        for job in pending_jobs:
            job_id = job["job_id"]
            url = job["status_url"]
            
            try:
                # If URL is missing, construct it (backward compatibility or safety)
                if not url:
                    url = f"{args.api_url.replace('/generalize', '')}/jobs/{job_id}/status"

                data = make_request(
                    url, 
                    username=args.api_user, 
                    password=args.api_password,
                    insecure=args.insecure
                )
                
                status = data.get("status")
                
                if status != job["status"]:
                    print(f"Job {job_id}: {job['status']} -> {status}")
                    job["status"] = status
                    any_changed = True
                
                if status == "completed":
                    # Download results immediately and save to state
                    job["results"] = data.get("results", [])
                    # Also save progress metrics if available
                    job["progress"] = data.get("progress")
                    
                elif status == "failed":
                    print(f"Job {job_id} FAILED.")
                    
            except Exception as e:
                print(f"Error checking job {job_id}: {e}")
        
        if any_changed:
            save_jobs_state(args.jobs_file, state)
            # Optional: Intermediate save to Excel?
            # process_results_to_excel(args, state) 

        print(f"Sleeping for {args.poll_interval} seconds...")
        time.sleep(args.poll_interval)

def main():
    parser = argparse.ArgumentParser(description="ZRM AI Bulk Job Title Generalizer (Async)")
    parser.add_argument("--input", default=INPUT_FILE_DEFAULT, help="Input Excel file")
    parser.add_argument("--output", default=OUTPUT_FILE_DEFAULT, help="Output Excel file")
    parser.add_argument("--jobs-file", default=JOBS_FILE_DEFAULT, help="JSON file to store job state")
    
    parser.add_argument("--api-url", default=os.getenv("ZRM_GENERALIZER_API_URL", API_URL_DEFAULT))
    parser.add_argument("--api-user", default=os.getenv("ZRM_GENERALIZER_API_USER", API_USER_DEFAULT))
    parser.add_argument("--api-password", default=os.getenv("ZRM_GENERALIZER_API_PASSWORD", API_PASSWORD_DEFAULT))
    
    parser.add_argument("--poll-interval", type=int, default=POLL_INTERVAL, help="Polling interval in seconds")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of input rows (for testing)")
    parser.add_argument("--insecure", action="store_true", help="Skip SSL verification")
    parser.add_argument("--force-submit", action="store_true", help="Force resubmission even if jobs file exists")

    args = parser.parse_args()

    # Determine mode
    state = load_jobs_state(args.jobs_file)
    has_jobs = len(state.get("jobs", [])) > 0
    
    if args.force_submit or not has_jobs:
        print("--- Starting Submission Phase ---")
        # Clear existing jobs if forcing
        if args.force_submit:
            state["jobs"] = []
            save_jobs_state(args.jobs_file, state)
            
        submit_batches(args)
        
    print("--- Entering Polling Phase ---")
    poll_loop(args)

if __name__ == "__main__":
    main()