Merge pull request #11316 from cli/babakks/automate-spam-issue-detection

Automate spam issue detection
2025-07-21 17:49:12 -06:00 · 2025-07-21 17:49:12 -06:00 · b2348f8386
commit b2348f8386
parent 42a8e0265c aa955e1fe6
7 changed files with 5348 additions and 0 deletions
--- a/.github/workflows/detect-spam.yml
+++ b/.github/workflows/detect-spam.yml
@ -0,0 +1,27 @@
+# Runs the spam detection script against every newly opened issue.
+name: Spam Issue Detection
+on:
+  issues:
+    types: [opened]
+
+permissions:
+  contents: none
+  issues: write
+  models: read
+
+jobs:
+  issue-spam:
+    runs-on: ubuntu-latest
+    environment: cli-automation
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Run spam detection
+        env:
+          GH_TOKEN: ${{ secrets.AUTOMATION_TOKEN }}
+          # Handed over as an environment variable and expanded by the shell,
+          # instead of being interpolated into the script text.
+          ISSUE_URL: ${{ github.event.issue.html_url }}
+        run: |
+          # The default `run` shell is `bash -e`, so a failing command ends the
+          # step immediately and a later `$?` check is never reached. Testing
+          # the script inside the `if` condition keeps the message reachable.
+          if ! ./.github/workflows/scripts/spam-detection/process-issue.sh "$ISSUE_URL"; then
+            echo "error processing issue"
+            exit 1
+          fi
--- a/.github/workflows/scripts/spam-detection/check-issue-prompts.yml
+++ b/.github/workflows/scripts/spam-detection/check-issue-prompts.yml
@ -0,0 +1,7 @@
+name: Detect spam
+model: openai/gpt-4o-mini
+messages:
+  - role: system
+    content: "" # Since it's not a fix value, it should be generated and replaced at runtime
+  - role: user
+    content: "" # This will be replaced at runtime
--- a/.github/workflows/scripts/spam-detection/check-issue.sh
+++ b/.github/workflows/scripts/spam-detection/check-issue.sh
@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Check if an issue is spam or not and output "PASS" (not spam) or "FAIL" (spam).
+#
+# Usage: check-issue.sh <issue-url>
+#
+# Regardless of the spam detection result, the script always exits with a zero
+# exit code, unless there's a runtime error.
+#
+# This script must be run from the root directory of the repository, because
+# generate-sys-prompt.sh reads the issue templates through a relative path.
+
+set -euo pipefail
+
+# Absolute path to the directory holding this script, used to locate the
+# sibling script and prompt file regardless of how the script was invoked.
+SPAM_DIR="$(dirname "$(realpath "$0")")"
+
+# Retrieve and prepare information about issue for detection.
+# `${1:-}` prevents `set -u` from aborting with "unbound variable" before the
+# clearer error below can be printed.
+_issue_url="${1:-}"
+if [[ -z "$_issue_url" ]]; then
+    echo "error: issue URL is empty" >&2
+    exit 1
+fi
+
+# Go template rendered by `gh issue view`; the tags match the format announced
+# to the model in the system prompt.
+_user_prompt_template='
+<TITLE>
+{{ .title }}
+</TITLE>
+
+<BODY>
+{{ .body }}
+</BODY>
+'
+
+_user_prompt="$(gh issue view --json title,body --template "$_user_prompt_template" "$_issue_url")"
+
+# Generate dynamic prompts for inference. Both prompts reach `yq` through the
+# environment (`strenv`), so their content is never parsed as a yq expression.
+_system_prompt="$("$SPAM_DIR/generate-sys-prompt.sh")"
+_final_prompt="$(_system="$_system_prompt" _user="$_user_prompt" yq eval ".messages[0].content = strenv(_system) | .messages[1].content = strenv(_user)" "$SPAM_DIR/check-issue-prompts.yml")"
+
+# The install's stderr is discarded, so report a failure explicitly instead of
+# letting `set -e` end the script without any message.
+if ! gh extension install github/gh-models 2>/dev/null; then
+    echo "error: failed to install the github/gh-models extension" >&2
+    exit 1
+fi
+
+# NOTE(review): `| cat` looks redundant inside a command substitution; kept
+# as-is until confirmed safe to remove.
+_result="$(gh models run --file <(printf '%s\n' "$_final_prompt") | cat)"
+
+# Anything other than the two expected verdicts is treated as a runtime error.
+if [[ "$_result" != "PASS" && "$_result" != "FAIL" ]]; then
+    echo "error: expected PASS or FAIL but got an unexpected result: $_result" >&2
+    exit 1
+fi
+
+printf '%s\n' "$_result"
--- a/.github/workflows/scripts/spam-detection/eval-prompts.yml
+++ b/.github/workflows/scripts/spam-detection/eval-prompts.yml
--- a/.github/workflows/scripts/spam-detection/eval.sh
+++ b/.github/workflows/scripts/spam-detection/eval.sh
@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Run the eval tests for the spam detection AI model.
+#
+# This script must be run from the root directory of the repository, because
+# generate-sys-prompt.sh reads the issue templates through a relative path.
+
+set -euo pipefail
+
+# Absolute path to the directory holding this script, used to locate the
+# sibling script and prompt file regardless of how the script was invoked.
+SPAM_DIR="$(dirname "$(realpath "$0")")"
+
+# Generate dynamic prompts for inference. Paths built from SPAM_DIR are quoted
+# so a checkout location containing spaces does not split them into words.
+_system_prompt="$("$SPAM_DIR/generate-sys-prompt.sh")"
+_final_prompt="$(_value="$_system_prompt" yq eval '.messages[0].content = strenv(_value)' "$SPAM_DIR/eval-prompts.yml")"
+
+# The following `gh models eval` command will fail after 20 requests due to rate limits.
+# We are going to open up an issue in `github/gh-models` to address this.
+#
+# TODO: break up `eval-prompts.yml` file into smaller batches to avoid hitting the rate limit.
+gh models eval <(printf '%s\n' "$_final_prompt")
--- a/.github/workflows/scripts/spam-detection/generate-sys-prompt.sh
+++ b/.github/workflows/scripts/spam-detection/generate-sys-prompt.sh
@ -0,0 +1,126 @@
+#!/bin/bash
+
+# Generate the system prompt for the spam detection AI model.
+#
+# This script must be run from the root directory of the repository.
+
+set -euo pipefail
+
+_system_prompt='
+# Your role
+
+You are a spam detection AI who helps identify spam issues submitted to the GitHub CLI repository.
+
+Note that:
+- More context about the GitHub CLI project is provided in section "Context" below.
+- Criteria for spam issues are provided in section "Spam content indicators" below.
+- Criteria for legitimate issues are provided in section "Legitimate content indicators" below.
+
+With every prompt you are given the title and a body of a GitHub issue. Your task is to determine if the issue is spam
+or not.
+
+Prompts will be formatted as follows, where the title and body of an issue are surrounded by `<TITLE>` and `<BODY>` tags:
+
+```
+<TITLE>
+[issue title goes here]
+</TITLE>
+
+<BODY>
+[issue body goes here]
+</BODY>
+```
+
+Your response must be single word `FAIL` if the issue looks like a spam, and `PASS` otherwise.
+
+## Context
+
+The GitHub CLI (also known as `gh`) project is a command-line tool for GitHub. It provides many commands to interact
+with various GitHub features.
+
+You can find the GitHub CLI tool documentation in the "GitHub CLI docs" section below, which helps you understand
+the available commands and their usages.
+
+## Legitimate content indicators
+
+- Clear description of a bug with steps to reproduce.
+- Feature requests with detailed explanations and use cases.
+- Documentation improvements with specific suggestions.
+- Questions about usage with context and examples.
+- Reports that reference specific code, files, or functionality.
+
+## Spam content indicators
+
+Here are the common patterns of spam issues:
+
+- A body that is a copy, or a small variation, of one of the issue templates defined under the "Issue templates" section below.
+  - When comparing with a template, you should ignore the headings and commented lines enclosed in `<!--`-`-->` tags, and
+    focus on the content.
+- Unrelated body and title that do not provide any useful information about the issue.
+- An empty issue body.
+- A body that contains only a single word or a few words, such as "bug", "help", "issue", "problem".
+- A meaningless body that does not provide any useful information about the issue.
+- A body that is just one or more links without any context or explanation.
+- Generic placeholder text like "Lorem ipsum" or "test test test".
+- Repetitive content (same word/phrase repeated multiple times).
+- Content that appears to be copied from other sources without relevance to the project.
+- Promotional content, advertisements, or unrelated marketing material.
+- Content in languages that seem inappropriate for the project context.
+- Issues that don''t relate to the project''s purpose (e.g. personal messages, off-topic discussions).
+- Content that seems like to be taken from, or quoting, another discussion or issue which does not establish a sensible
+  context, or problem statement, or feedback.
+
+'
+
+# Append the help output for the root `gh` command
+_system_prompt="${_system_prompt}
+
+## GitHub CLI docs
+
+The GitHub CLI tool has many commands, below is a piece of the help output, surrounded with \`<GitHub CLI docs>\` tags,
+for the root \`gh\` command.
+
+<GitHub CLI docs>
+\`\`\`
+$(gh --help)
+\`\`\`
+</GitHub CLI docs>
+"
+
+# Append the issue templates to the system prompt.
+_system_prompt="${_system_prompt}
+
+## Issue templates
+
+Here are the issue templates already defined in the project. The templates are surrounded with \`<Template N>\` tags and
+triple backticks, where N is the template number. The templates are provided to help you understand the common patterns
+of issues.
+
+"
+
+# Append every Markdown issue template, numbered from 1, then print the prompt.
+_template_index=1
+for template_file in .github/ISSUE_TEMPLATE/*.md; do
+    # An unmatched glob is left as the literal pattern; skip it and any
+    # non-regular file.
+    if ! [[ -f "$template_file" ]]; then
+        continue
+    fi
+
+    # Remove the YAML front matter: only a block that opens with `---` on the
+    # very first line and runs to the next `---` line. Later `---` lines (for
+    # example Markdown horizontal rules) belong to the template and are kept;
+    # a plain `/^---$/,/^---$/d` range would delete those sections as well.
+    _template_content="$(awk '
+        NR == 1 && /^---$/ { in_front_matter = 1; next }
+        in_front_matter && /^---$/ { in_front_matter = 0; next }
+        !in_front_matter
+    ' "$template_file")"
+
+    # Escape code fences at the start of a line so they cannot close the fence
+    # that wraps the template below.
+    _escaped_template="$(sed -e 's/^```/\\```/g' <<< "$_template_content")"
+
+    _system_prompt="${_system_prompt}
+
+<Template ${_template_index}>
+
+\`\`\`
+${_escaped_template}
+\`\`\`
+</Template ${_template_index}>
+"
+
+    # Plain assignment instead of `((_template_index++))`: an arithmetic
+    # command returns a non-zero status when it evaluates to 0, which `set -e`
+    # would treat as a failure if the start value were ever changed to 0.
+    _template_index=$((_template_index + 1))
+done
+
+# Missing templates usually mean the script was not started from the
+# repository root. Warn, but still emit the prompt.
+if [[ "$_template_index" -eq 1 ]]; then
+    echo "warning: no issue templates found under .github/ISSUE_TEMPLATE" >&2
+fi
+
+printf '%s\n' "$_system_prompt"
--- a/.github/workflows/scripts/spam-detection/process-issue.sh
+++ b/.github/workflows/scripts/spam-detection/process-issue.sh
@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Performs spam detection on an issue and labels it if it's spam.
+#
+# Usage: process-issue.sh <issue-url>
+#
+# Regardless of the spam detection result, the script always exits with a zero
+# exit code, unless there's a runtime error.
+#
+# This script must be run from the root directory of the repository, because
+# the check script is referenced through a relative path.
+
+set -euo pipefail
+
+# `${1:-}` prevents `set -u` from aborting with "unbound variable" before the
+# clearer error below can be printed.
+_issue_url="${1:-}"
+if [[ -z "$_issue_url" ]]; then
+    echo "error: issue URL is empty" >&2
+    exit 1
+fi
+
+_suspected_spam_label="suspected-spam"
+_check_issue_script=".github/workflows/scripts/spam-detection/check-issue.sh"
+
+# A failure of the check script ends this script too (`set -e` applies to the
+# assignment), so `_result` only ever holds the output of a successful run.
+_result="$("$_check_issue_script" "$_issue_url")"
+
+if [[ "$_result" == "PASS" ]]; then
+    echo "detected as not-spam: $_issue_url"
+    exit 0
+fi
+
+# Only an explicit FAIL verdict may label the issue; any other output is a
+# runtime error rather than a spam verdict.
+if [[ "$_result" != "FAIL" ]]; then
+    echo "error: unexpected spam detection result: $_result" >&2
+    exit 1
+fi
+
+echo "detected as spam: $_issue_url"
+
+gh issue edit --add-label "$_suspected_spam_label" "$_issue_url"
+
+echo "issue labelled as suspected spam"