An autonomous agent that reviews GitHub Pull Requests for bugs, security vulnerabilities, and code quality issues — powered by LangGraph + Llama 3.3 70B with honest evaluation benchmarks and full observability.
A 5-node LangGraph StateGraph that takes a PR diff in and produces a structured review comment out. Every node is traced with Langfuse for full observability.
def build_review_graph() -> StateGraph:
    """Build and compile the linear code-review pipeline.

    Wires the five pipeline nodes in a fixed order — parse, filter,
    analyze, aggregate, format — and returns the compiled graph,
    ready to be invoked with an initial ReviewState.
    """
    pipeline = [
        ("parse_diff", parse_diff),
        ("filter_files", filter_files),
        ("analyze_files", analyze_files),
        ("aggregate", aggregate),
        ("format_review", format_review),
    ]
    graph = StateGraph(ReviewState)
    for node_name, node_fn in pipeline:
        graph.add_node(node_name, node_fn)
    # The state flows strictly linearly: each node feeds the next,
    # and the last node terminates the graph.
    graph.set_entry_point(pipeline[0][0])
    for (src, _), (dst, _) in zip(pipeline, pipeline[1:]):
        graph.add_edge(src, dst)
    graph.add_edge(pipeline[-1][0], END)
    return graph.compile()
class ReviewState(TypedDict, total=False):
    """State passed through the LangGraph review pipeline.

    total=False: every key is optional — each node contributes its own
    slice of state as the pipeline progresses.
    """

    pr_url: str    # URL of the pull request under review
    raw_diff: str  # full unified diff text for the PR
    file_diffs: list[dict[str, str]]      # per-file chunks from parse_diff: {"path", "diff"}
    filtered_files: list[dict[str, str]]  # file_diffs minus skipped extensions
    findings: list[dict[str, Any]]        # raw LLM findings, before Pydantic validation
    summary: dict[str, Any]               # validated ReviewSummary dumped to a dict
    formatted_review: str                 # final GitHub-flavored markdown
    errors: list[str]                     # non-fatal errors collected along the way
Each node in the pipeline has a single responsibility. The state flows linearly from raw diff to formatted markdown, with Pydantic validation at every boundary.
Splits the unified diff into per-file chunks using a regex that matches diff --git headers.
_DIFF_HEADER_RE = re.compile(
r"^diff --git a/(.+?) b/(.+)$", re.MULTILINE
)
@observe(name="parse_diff")
def parse_diff(state: ReviewState) -> ReviewState:
    """Split the raw unified diff into per-file chunks.

    re.split on the header regex yields, after the preamble at index 0,
    repeating triples of (a-path, b-path, chunk body). The b-path names
    the file after the change, so that is the path we keep.
    """
    pieces = _DIFF_HEADER_RE.split(state["raw_diff"])
    file_diffs: list[dict[str, str]] = [
        {"path": pieces[j + 1], "diff": pieces[j + 2].strip()}
        for j in range(1, len(pieces) - 2, 3)
    ]
    return {"file_diffs": file_diffs}
Removes non-code files like lockfiles, images, fonts, and config from the analysis set.
# File extensions excluded from LLM analysis: lockfiles/docs/config,
# images, fonts, and compiled artifacts carry no reviewable source.
SKIP_EXTENSIONS = frozenset(
    {".lock", ".md", ".json", ".yaml", ".yml", ".toml"}  # docs / config
    | {".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico"}  # images
    | {".woff", ".woff2", ".ttf", ".eot"}                # fonts
    | {".pyc", ".pyo", ".so", ".dll"}                    # compiled artifacts
)
@observe(name="filter_files")
def filter_files(state: ReviewState) -> ReviewState:
    """Drop non-code files (lockfiles, images, fonts, ...) before analysis."""
    # Diff paths are always forward-slash separated, hence PurePosixPath.
    keep = [
        fd
        for fd in state["file_diffs"]
        if PurePosixPath(fd["path"]).suffix.lower() not in SKIP_EXTENSIONS
    ]
    return {"filtered_files": keep}
Sends each file to Llama 3.3 70B via Groq with a structured prompt, then extracts findings with a 3-tier JSON fallback.
def _extract_json_array(text: str) -> list[dict[str, Any]]:
"""Extract a JSON array from LLM output with fallback strategies."""
stripped = text.strip()
# Strategy 1: direct parse
if stripped.startswith("["):
try:
return json.loads(stripped)
except json.JSONDecodeError:
pass
# Strategy 2: extract from markdown code fences
fence_match = re.search(
r"```(?:json)?\s*\n?(.*?)```", stripped, re.DOTALL
)
if fence_match:
try:
return json.loads(fence_match.group(1).strip())
except json.JSONDecodeError:
pass
# Strategy 3: find any JSON array in the text
array_match = re.search(r"\[.*]", stripped, re.DOTALL)
if array_match:
try:
return json.loads(array_match.group(0))
except json.JSONDecodeError:
pass
raise json.JSONDecodeError("No JSON array found", text, 0)
LLMs don't always return clean JSON. The extraction pipeline handles three common failure modes: raw JSON arrays, JSON wrapped in markdown fences, and JSON buried in explanatory text. Each strategy falls through to the next, with Pydantic validation as a final gate.
Validates raw findings through Pydantic models and computes statistics by severity and category.
@observe(name="aggregate")
def aggregate(state: ReviewState) -> ReviewState:
    """Validate raw findings through Pydantic and compute summary stats.

    Each raw finding dict is validated into a ReviewFinding (invalid
    items raise pydantic's ValidationError to the caller). Severity and
    category counts share one flat stats dict — safe here because the
    Severity and Category value sets do not overlap — plus a "total" key.
    """
    all_findings = [
        ReviewFinding.model_validate(item) for item in state["findings"]
    ]

    stats: dict[str, int] = {}
    for finding in all_findings:
        sev = finding.severity.value
        cat = finding.category.value
        stats[sev] = stats.get(sev, 0) + 1
        stats[cat] = stats.get(cat, 0) + 1
    stats["total"] = len(all_findings)

    summary = ReviewSummary(
        findings=all_findings,
        stats=stats,
        model_used=_MODEL_NAME,
        # BUG FIX: the original read `partial.get("tokens_used", 0)`, but
        # `partial` is never defined in this scope (NameError at runtime).
        # Token usage travels in the pipeline state, so read it from there.
        # NOTE(review): confirm analyze_files actually writes "tokens_used"
        # into the state dict.
        tokens_used=state.get("tokens_used", 0),
        # ...
    )
    return {"summary": summary.model_dump()}
Converts the validated summary into GitHub-flavored markdown, grouped by severity with icons.
# Render order and icon for each severity bucket in the markdown output.
severity_order = [Severity.CRITICAL, Severity.WARNING, Severity.SUGGESTION]
severity_icons = {
    Severity.CRITICAL: "🔴",
    Severity.WARNING: "🟡",
    Severity.SUGGESTION: "🔵",
}

for sev in severity_order:
    bucket = [item for item in summary.findings if item.severity == sev]
    # Empty severity levels get no section header at all.
    if not bucket:
        continue
    lines.append(f"### {severity_icons[sev]} {sev.value.title()}")
    for finding in bucket:
        entry = (
            f"- **{finding.file_path}:{finding.line_number}** "
            f"[{finding.category.value}] "
            f"(confidence: {finding.confidence:.0%})"
        )
        lines.append(entry)
        lines.append(f" {finding.message}")
        if finding.suggested_fix:
            lines.append(f" > **Fix:** {finding.suggested_fix}")
Every LLM finding is validated against a Pydantic model with strict type constraints.
class Severity(str, enum.Enum):
    """How serious a review finding is.

    str-valued so members serialize directly; the values must match the
    "severity" strings the LLM is prompted to emit.
    """

    CRITICAL = "critical"
    WARNING = "warning"
    SUGGESTION = "suggestion"
class Category(str, enum.Enum):
    """Kind of issue a finding reports.

    str-valued so members serialize directly; the values must match the
    "category" strings the LLM is prompted to emit.
    """

    BUG = "bug"
    SECURITY = "security"
    PERFORMANCE = "performance"
    STYLE = "style"
    LOGIC = "logic"
class ReviewFinding(BaseModel):
    """A single validated issue reported by the LLM reviewer."""

    # frozen=True: findings are immutable once validated.
    model_config = ConfigDict(frozen=True)

    file_path: str    # path of the file the finding refers to
    line_number: int  # line the finding points at within that file
    severity: Severity
    category: Category
    message: str      # human-readable description of the issue
    confidence: float = Field(ge=0.0, le=1.0)  # model certainty, clamped to [0, 1]
    suggested_fix: str | None = None           # optional remediation hint
# Per-file analysis prompt, rendered with str.format(file_path=..., file_diff=...).
# BUG FIX: the literal braces of the example JSON object were single ({ / }),
# which makes str.format() raise ValueError; they are doubled ({{ / }}) so the
# example survives formatting while {file_path}/{file_diff} still substitute.
REVIEW_PROMPT_TEMPLATE = """\
You are an expert code reviewer. Analyze the following git diff for bugs, \
security vulnerabilities, performance issues, and code quality problems.
File: {file_path}
```diff
{file_diff}
```
Return your findings as a JSON array. Each element must follow this schema exactly:
[
  {{
    "file_path": "{file_path}",
    "line_number": 42,
    "severity": "warning",
    "category": "bug",
    "message": "Description of the issue found",
    "confidence": 0.85,
    "suggested_fix": "How to fix the issue, or null if unclear"
  }}
]
Rules:
- Only report real issues, not formatting nitpicks handled by linters.
- Set confidence based on how certain you are about the issue.
- If there are no issues, return an empty array: []
- Return ONLY a valid JSON array. No explanations, no markdown wrapping.
"""
Evaluated against 10 synthetic PR diffs containing 30 known bugs across security vulnerabilities, logic errors, and common bug patterns.
| Benchmark | TP | FP | FN | Precision | Recall | F1 | Missed Finding |
|---|---|---|---|---|---|---|---|
| buffer_overflow | 2 | 0 | 1 | 100% | 66.7% | 80.0% | sprintf without size limit |
| hardcoded_secret | 3 | 1 | 2 | 75.0% | 60.0% | 66.7% | JWT secret, SMTP password |
| insecure_deser. | 3 | 1 | 2 | 75.0% | 60.0% | 66.7% | pickle.loads on backup/webhook |
| logic_error | 2 | 1 | 1 | 66.7% | 66.7% | 66.7% | OR instead of AND in eligibility |
| missing_validation | 2 | 1 | 2 | 66.7% | 50.0% | 57.1% | Path traversal, input format |
| sql_injection | 1 | 0 | 1 | 100% | 50.0% | 66.7% | f-string query in get_user |
| xss_vulnerability | 1 | 1 | 1 | 50.0% | 50.0% | 50.0% | Stored XSS without escaping |
| off_by_one | 1 | 0 | 2 | 100% | 33.3% | 50.0% | Integer division, negative index |
| null_reference | 0 | 1 | 1 | 0% | 0% | 0% | NoneType subscript on payment |
| race_condition | 0 | 1 | 2 | 0% | 0% | 0% | check_and_increment, transfer |
This section exists because honest evaluation matters more than impressive numbers.
Roughly 1 in 3 findings is noise. In a production setting, this erodes developer trust quickly. A second LLM pass to self-critique would likely reduce this.
The agent misses more than half of security vulnerabilities. For context, GPT-4 scored just 13% on the SecLLMHolmes benchmark for real-world vulnerability detection.
Concurrency bugs were completely missed. Single-file, single-pass analysis fundamentally cannot reason about shared state, timing, or thread safety.
Files are analyzed independently. The agent can't reason about call sites, type definitions, or invariants defined elsewhere in the codebase.
The agent is best understood as a first-pass reviewer that catches surface-level issues and flags areas for human attention. It does not replace SAST tools, linters, or human review. The value is in reducing the reviewer's search space.
Every pipeline run is traced with Langfuse, providing per-node execution time, token usage, cost breakdown, and error tracking.
# Every node is decorated for automatic tracing
@observe(name="parse_diff")
def parse_diff(state: ReviewState) -> ReviewState:
    """Pipeline node; @observe records a Langfuse span per invocation."""
    # ... node logic ...
    # Attach input/output metadata to the current span so the dashboard
    # can show what each node consumed and produced.
    _update_span(
        input={"diff_length": len(raw_diff)},
        output={"file_count": len(file_diffs)},
    )
    return {"file_diffs": file_diffs}
# LLM calls get generation-level tracing
@observe(name="analyze_single_file", as_type="generation")
def _analyze_single_file(llm, path, diff):
    """Analyze one file's diff with the LLM; traced as a Langfuse generation."""
    # ... LLM call ...
    # Record model name, token counts, and estimated cost on the generation
    # so per-run cost breakdowns are available in the trace.
    _update_generation(
        model=_MODEL_NAME,
        usage_details={"input": prompt_tokens, "output": completion_tokens},
        cost_details={"estimated_cost_usd": cost},
    )
    return validated_findings, token_usage
Traces are exported as JSON and visualized on the live dashboard (Observability tab).
Zero infrastructure. The agent runs as a GitHub Action on PR events — no webhook server, no hosting required.
name: AI Code Review

# Trigger on PR open and on every new push to an open PR, but only
# when source files (not docs/config) changed.
on:
  pull_request:
    types: [opened, synchronize]
    paths:
      - "**.py"
      - "**.js"
      - "**.ts"
      - "**.go"
      - "**.rs"
      - "**.java"
      - "**.c"
      - "**.cpp"
      # ... 18 language extensions

# Least-privilege token: read the code, write the review comment.
permissions:
  contents: read
  pull-requests: write

jobs:
  review:
    runs-on: ubuntu-latest
    timeout-minutes: 10  # hard cap so a hung run cannot burn CI minutes
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: pip
      - run: pip install -e .
      - run: python scripts/run_review.py
        env:
          # Secrets are injected by GitHub; never hard-coded here.
          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_URL: ${{ github.event.pull_request.url }}
          PR_DIFF_URL: ${{ github.event.pull_request.diff_url }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          REPO_FULL_NAME: ${{ github.repository }}
Each tool was chosen for a specific reason — no defaults, no hype-driven decisions.