# KARL/gitea_bot/poller.py

import os
import time
import json
import re
from pathlib import Path
from typing import List, Optional

import requests
from gitea_client import GiteaClient
from gemini_client import GeminiClient
from dotenv import load_dotenv

# Load environment variables from the .env file located one directory above
# this file's directory (the project root), but only if that file exists.
env_path = Path(__file__).resolve().parents[1] / ".env"
if env_path.exists():
    load_dotenv(dotenv_path=env_path)

# Configuration, read from the process environment once at import time.
API_URL = os.getenv("GITEA_API_URL")
TOKEN = os.getenv("GITEA_TOKEN")
# Compared against the requested-reviewer logins of each open PR in run().
BOT = os.getenv("BOT_USERNAME")
# Passed to time.sleep() between polling rounds; int() raises ValueError at
# import time if the variable is set to a non-integer value.
POLL_INTERVAL = int(os.getenv("POLL_INTERVAL", "60"))
# NOTE(review): POLL_OWNER and POLL_REPOS are read here but not referenced by
# any other code visible in this file - confirm whether they are still needed.
POLL_OWNER = os.getenv("POLL_OWNER")
POLL_REPOS = os.getenv("POLL_REPOS")  # comma-separated owner/repo

# ROOT is the same directory that holds the .env file above; the JSON file at
# SEEN_PATH is what load_seen() reads and save_seen() writes.
ROOT = Path(__file__).resolve().parent.parent
SEEN_PATH = ROOT / ".poller_seen.json"

# Fail fast at import time when any of the three mandatory settings is missing
# or empty.
if not (API_URL and TOKEN and BOT):
    raise RuntimeError("GITEA_API_URL, GITEA_TOKEN and BOT_USERNAME must be set for poller")

# Module-level client instances shared by handle_assignment() and run().
gitea = GiteaClient(API_URL, TOKEN)
gemini = GeminiClient()


def load_seen() -> set:
    """Return the set of already-handled PR keys stored at ``SEEN_PATH``.

    Every entry of the stored JSON array is converted to a tuple so that it
    can be a set member. A missing file, or any error raised while reading
    or converting its content, yields an empty set.
    """
    if not SEEN_PATH.exists():
        return set()

    try:
        with open(SEEN_PATH, "r", encoding="utf-8") as handle:
            entries = json.load(handle)
            return {tuple(entry) for entry in entries}
    except Exception:
        # Best effort: unreadable state is treated as "nothing seen yet".
        return set()


def save_seen(seen: set):
    """Persist the handled-PR keys to ``SEEN_PATH`` as a JSON list of lists.

    The data is written to a sibling ``.tmp`` file first and then moved over
    ``SEEN_PATH`` with ``os.replace``. Writing in place would leave a
    truncated file behind if the process is interrupted mid-write, and
    ``load_seen`` treats an undecodable file as an empty set, which would
    cause every assigned PR to be reviewed again.

    Args:
        seen: Set of key tuples; each tuple is stored as a JSON array.

    Raises:
        OSError: If the temporary file cannot be written or moved.
        TypeError: If a key contains a value that is not JSON-serializable.
    """
    tmp_path = SEEN_PATH.with_name(SEEN_PATH.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump([list(x) for x in seen], f)
    # Swap the finished file into place so readers never observe a
    # half-written state file.
    os.replace(tmp_path, SEEN_PATH)


def build_prompt_from_file(file_dict: dict) -> str:
    """Assemble the review prompt for a single file's diff.

    The file name is taken from ``filename`` or ``path`` (``"unknown"`` when
    both are missing or empty) and the diff from ``patch`` or ``diff``.
    Diffs longer than 30000 characters are cut at that length and marked as
    truncated.
    """
    name = file_dict.get("filename") or file_dict.get("path") or "unknown"
    diff_text = file_dict.get("patch") or file_dict.get("diff") or ""

    max_chars = 30000
    if len(diff_text) > max_chars:
        diff_text = diff_text[:max_chars] + "\n...TRUNCATED..."

    # One list entry per output line; empty entries produce the blank lines
    # that separate the rules, the schema and the file section.
    sections = [
        "You are a senior code reviewer. Analyze exactly one file diff and return ONLY JSON.",
        "You review C++ code with the Qt framework",
        "Rules:",
        "1) Only report real issues or actionable improvements.",
        "2) Use diff positions (line index in the unified diff hunk) for comment anchoring.",
        "3) Keep each comment short and specific.",
        "4) If there are no findings, return an empty findings array.",
        "",
        "JSON schema:",
        "{",
        '  "summary": "short summary",',
        '  "findings": [',
        "    {",
        '      "diff_position": 12,',
        '      "severity": "high|medium|low",',
        '      "comment": "text"',
        "    }",
        "  ]",
        "}",
        "",
        f"File: {name}",
        "Unified diff:",
        diff_text,
    ]
    return "\n".join(sections)


def extract_json_object(text: str) -> Optional[dict]:
    """Pull a JSON object out of model output.

    The text is tried as-is first (after removing a surrounding Markdown
    code fence and an optional leading ``json`` tag). Only when that does
    not decode at all is the span from the first ``{`` to the last ``}``
    tried instead. Returns the decoded dict, or ``None`` when the text is
    empty, nothing decodes, or the decoded value is not a JSON object.
    """
    if not text:
        return None

    def _decode(payload: str):
        """Return ``(decoded_ok, value)`` for *payload*."""
        try:
            return True, json.loads(payload)
        except json.JSONDecodeError:
            return False, None

    body = text.strip()
    if body.startswith("```"):
        rows = body.splitlines()
        is_fenced = (
            len(rows) >= 3
            and rows[0].startswith("```")
            and rows[-1].strip() == "```"
        )
        if is_fenced:
            # Drop the opening and closing fence lines.
            body = "\n".join(rows[1:-1]).strip()
            if body.startswith("json"):
                body = body[4:].strip()

    decoded_ok, value = _decode(body)
    if not decoded_ok:
        # Fall back to the outermost brace-delimited span.
        first = body.find("{")
        last = body.rfind("}")
        if first < 0 or last <= first:
            return None
        decoded_ok, value = _decode(body[first:last + 1])

    if decoded_ok and isinstance(value, dict):
        return value
    return None


def parse_structured_review(ai_response: str) -> dict:
    """Turn raw model output into ``{"summary": str, "findings": [...]}``.

    Findings are kept only when they are dicts with an int-convertible
    ``diff_position`` and a non-empty ``comment``. ``severity`` is
    lower-cased and defaults to ``"low"``; the summary defaults to
    ``"No summary provided."``.
    """
    payload = extract_json_object(ai_response) or {}
    summary = str(payload.get("summary") or "No summary provided.").strip()

    candidates = payload.get("findings") or []
    if not isinstance(candidates, list):
        # Anything other than a list contributes no findings.
        candidates = []

    normalized = []
    for entry in candidates:
        if not isinstance(entry, dict):
            continue

        try:
            position = int(entry.get("diff_position"))
        except (TypeError, ValueError):
            # Missing or non-numeric position: the finding cannot be anchored.
            continue

        text = str(entry.get("comment") or "").strip()
        level = str(entry.get("severity") or "low").strip().lower()
        if not text:
            continue

        normalized.append(
            {
                "diff_position": position,
                "severity": level,
                "comment": text,
            }
        )

    return {"summary": summary, "findings": normalized}


def split_unified_diff_by_file(unified_diff: str) -> dict:
    """Split a PR unified diff into per-file diff chunks keyed by path.

    A new chunk starts at every ``diff --git`` line. The chunk's key is the
    path from its ``+++ `` header line when there is one; otherwise (deleted
    files, whose header is ``+++ /dev/null``, and chunks without hunks such
    as pure renames) it falls back to the ``b/`` path of the ``diff --git``
    line. ``+++ `` is only treated as a header before the chunk's first
    ``@@`` line, so added lines whose content begins with ``++ `` cannot
    re-key the chunk.

    Args:
        unified_diff: Full unified diff text; ``None`` or ``""`` yields ``{}``.

    Returns:
        Dict mapping file path to that file's stripped diff chunk. Chunks
        for which no path could be determined are omitted.
    """
    file_diffs: dict = {}
    current_lines: List[str] = []
    current_path: Optional[str] = None
    in_hunk = False

    def flush_current() -> None:
        if current_path and current_lines:
            file_diffs[current_path] = "\n".join(current_lines).strip()

    for line in (unified_diff or "").splitlines():
        if line.startswith("diff --git "):
            flush_current()
            current_lines = [line]
            in_hunk = False
            # Default key from the header; a later "+++ b/<path>" line
            # overrides it when present.
            header = re.match(r"diff --git a/(.+?) b/(.+)", line)
            current_path = header.group(2) if header else None
            continue

        current_lines.append(line)

        if line.startswith("@@"):
            # Everything from here to the next "diff --git" is hunk content.
            in_hunk = True
        elif not in_hunk and line.startswith("+++ "):
            # Example: +++ b/src/main.cpp
            raw_path = line[4:].strip()
            if raw_path != "/dev/null":
                current_path = raw_path[2:] if raw_path.startswith("b/") else raw_path

    flush_current()
    return file_diffs


def handle_assignment(owner: str, repo: str, pr: dict):
    """Review one pull request file by file and post a single PR review.

    Steps: fetch the PR's file list, fall back to the PR-wide unified diff
    when none of the listed files carries a patch, ask the model for a
    review of each file that has a diff, then post one review containing
    the fixed intro text, any per-file problems and the collected line
    comments.

    Returns:
        True when the review was posted; False when the file list could not
        be fetched, was empty, or posting the review failed.
    """
    pr_number = pr.get("number") or pr.get("index") or pr.get("id")

    try:
        files = gitea.list_pull_request_files(owner, repo, pr_number)
    except Exception as e:
        print(f"failed to fetch files for {owner}/{repo}#{pr_number}: {e}")
        return False

    if not files:
        print(f"No files found for {owner}/{repo}#{pr_number}")
        return False

    def _inline_patch(entry: dict) -> str:
        """Return the diff text carried by a file entry, or ""."""
        return entry.get("patch") or entry.get("diff") or ""

    # Some Gitea setups return filenames but no patch in /pulls/{n}/files;
    # in that case split the PR-wide diff instead.
    fallback_patches = {}
    if not any(_inline_patch(entry).strip() for entry in files):
        try:
            unified_diff = gitea.get_pull_request_diff(owner, repo, pr_number)
            fallback_patches = split_unified_diff_by_file(unified_diff)
            print(
                f"Loaded fallback unified diff for {owner}/{repo}#{pr_number} "
                f"({len(fallback_patches)} file patches)"
            )
        except Exception as e:
            print(f"failed to load fallback diff for {owner}/{repo}#{pr_number}: {e}")

    review_comments: List[dict] = []
    problems: List[str] = []

    # Each file is sent to the model on its own.
    for entry in files:
        filename = entry.get("filename") or entry.get("path")
        if not filename:
            continue

        diff_text = _inline_patch(entry)
        if not diff_text.strip() and fallback_patches:
            diff_text = fallback_patches.get(filename, "")

        if not diff_text.strip():
            problems.append(f"**{filename}**: No textual diff available.")
            continue

        print(f"Analyzing {filename} for {owner}/{repo}#{pr_number}")
        prompt = build_prompt_from_file({**entry, "patch": diff_text})

        try:
            ai_response = gemini.generate_review(prompt)
            parsed_review = parse_structured_review(ai_response)

            # NOTE(review): the prompt asks the model for diff positions while
            # the value is sent under the "new_position" key - confirm the
            # client/API interpret it the same way.
            for finding in parsed_review["findings"]:
                severity = finding["severity"].upper()
                review_comments.append(
                    {
                        "path": filename,
                        "new_position": finding["diff_position"],
                        "body": f"[{severity}] {finding['comment']}",
                    }
                )
        except Exception as e:
            print(f"failed to generate review for {filename}: {e}")
            problems.append(f"**{filename}**: Error analyzing file - {e}")

    # One PR review carries the intro text plus all line-anchored comments.
    body_parts = [
        "### AI Code Review by [Karl der Computer](https://dev.skui.eu/SKUI/KARL)\n",
        "There are three severity levels for comments: HIGH (red) indicates critical issues, MEDIUM (orange) suggests improvements, and LOW (blue) points out minor concerns or style suggestions.\n",
        "Note: AI can make mistakes. Please review carefully.\n",
        "If there are any mistakes, please report to the [issue tracker](https://dev.skui.eu/SKUI/KARL/issues) of Karl\n",
    ]
    if problems:
        body_parts.append("\n#### Issues with file analysis:\n")
        body_parts.append("\n".join(problems))
    review_body = "".join(body_parts)

    try:
        gitea.create_pull_request_review(
            owner,
            repo,
            pr_number,
            body=review_body,
            comments=review_comments or None,
        )
        print(f"Posted review for {owner}/{repo}#{pr_number} with {len(review_comments)} line comments")
        return True
    except Exception as e:
        print(f"failed to post review for {owner}/{repo}#{pr_number}: {e}")
        return False


def run():
    """Poll the accessible repositories forever and review assigned PRs.

    Each round lists the repositories, then the open pull requests of each
    one, and calls ``handle_assignment`` for every PR that lists ``BOT``
    among its requested reviewers and has not been handled yet. Handled PRs
    are remembered through ``load_seen`` / ``save_seen``, so a PR is only
    marked as seen after its review was posted successfully. The loop
    sleeps ``POLL_INTERVAL`` seconds between rounds and ends on
    ``KeyboardInterrupt``.

    A failure to list repositories is logged and retried on the next round
    instead of terminating the poller.
    """
    seen = load_seen()
    print("Starting poller; checking repos...")
    try:
        while True:
            try:
                repos = list(gitea.available_repositories())
            except Exception as e:
                # A transient API/network failure must not kill the
                # long-running poller; wait one interval and try again.
                print(f"failed to list repositories: {e}")
                time.sleep(POLL_INTERVAL)
                continue

            print(f"Found {len(repos)} accessible repositories")
            for owner, repo in repos:
                try:
                    prs = gitea.list_open_pull_requests(owner, repo)
                except requests.exceptions.HTTPError as e:
                    response = getattr(e, "response", None)
                    if response is not None and response.status_code == 404:
                        # Repo exists but is not accessible (permission or deleted)
                        continue
                    print(f"failed to list PRs for {owner}/{repo}: {e}")
                    continue
                except Exception as e:
                    print(f"failed to list PRs for {owner}/{repo}: {e}")
                    continue

                for pr in prs or []:
                    # Same number/index/id fallback as handle_assignment, so
                    # PRs without a "number" field do not all share one key.
                    pr_number = pr.get("number") or pr.get("index") or pr.get("id")
                    key = (f"{owner}/{repo}", pr_number)
                    if key in seen:
                        continue

                    reviewers = [
                        r.get("login") or r.get("username")
                        for r in (pr.get("requested_reviewers") or [])
                    ]
                    if BOT not in reviewers:
                        continue

                    print(f"Detected assignment: {key}")
                    if handle_assignment(owner, repo, pr):
                        seen.add(key)
                        save_seen(seen)
            time.sleep(POLL_INTERVAL)
    except KeyboardInterrupt:
        print("Poller stopped")


# Start the polling loop only when this file is executed as a script.
if __name__ == "__main__":
    run()