Merge pull request #11316 from cli/babakks/automate-spam-issue-detection

Automate spam issue detection
This commit is contained in:
Kynan Ware 2025-07-21 17:49:12 -06:00 committed by GitHub
commit b2348f8386
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 5348 additions and 0 deletions

27
.github/workflows/detect-spam.yml vendored Normal file
View file

@ -0,0 +1,27 @@
# Runs automated spam detection on every newly opened issue.
name: Spam Issue Detection
on:
  issues:
    types: [opened]
permissions:
  contents: none
  issues: write
  models: read
jobs:
  issue-spam:
    runs-on: ubuntu-latest
    environment: cli-automation
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Run spam detection
        env:
          GH_TOKEN: ${{ secrets.AUTOMATION_TOKEN }}
          # Passed through the environment (not interpolated into the script)
          # so the value is never parsed as shell code.
          ISSUE_URL: ${{ github.event.issue.html_url }}
        run: |
          # The default `bash -e` shell aborts the step at the first failing
          # command, so a separate `$?` check after the call would never run.
          # Testing the command directly in the `if` keeps the error message reachable.
          if ! ./.github/workflows/scripts/spam-detection/process-issue.sh "$ISSUE_URL"; then
            echo "error processing issue" >&2
            exit 1
          fi

View file

@ -0,0 +1,7 @@
# Prompt skeleton for spam detection. Both message contents below are empty
# placeholders that are filled in at runtime.
name: Detect spam
model: openai/gpt-4o-mini
messages:
- role: system
content: "" # Since it's not a fixed value, it is generated and substituted at runtime
- role: user
content: "" # This will be replaced at runtime

View file

@ -0,0 +1,48 @@
#!/bin/bash
# Check if an issue is spam or not and output "PASS" (not spam) or "FAIL" (spam).
#
# Usage: check-issue.sh <issue-url>
#
# Regardless of the spam detection result, the script always exits with a zero
# exit code, unless there's a runtime error.
#
# This script must be run from the root directory of the repository.

set -euo pipefail

# Absolute path to the directory holding this script, used to locate the
# sibling prompt generator and prompt file independently of how the script
# was invoked.
SPAM_DIR="$(dirname "$(realpath "$0")")"

# Retrieve and prepare information about issue for detection.
# `${1:-}` keeps `set -u` from aborting with a bare "unbound variable" message
# when no argument is given, so the explicit check below can report the problem.
_issue_url="${1:-}"
if [[ -z "$_issue_url" ]]; then
  echo "error: issue URL is empty" >&2
  exit 1
fi

_user_prompt_template='
<TITLE>
{{ .title }}
</TITLE>
<BODY>
{{ .body }}
</BODY>
'
_user_prompt="$(gh issue view --json title,body --template "$_user_prompt_template" "$_issue_url")"

# Generate dynamic prompts for inference.
_system_prompt="$("$SPAM_DIR/generate-sys-prompt.sh")"

# The prompts are handed to yq through the environment (strenv) rather than
# being spliced into the expression, so their content is never parsed as yq syntax.
_final_prompt="$(_system="$_system_prompt" _user="$_user_prompt" yq eval ".messages[0].content = strenv(_system) | .messages[1].content = strenv(_user)" "$SPAM_DIR/check-issue-prompts.yml")"

# stderr of the install is intentionally discarded; report a failure ourselves
# so the script does not die silently under `set -e`.
if ! gh extension install github/gh-models 2>/dev/null; then
  echo "error: failed to install the github/gh-models extension" >&2
  exit 1
fi

# Piping through `cat` means the command's stdout is a pipe rather than a terminal.
_result="$(gh models run --file <(printf '%s\n' "$_final_prompt") | cat)"

if [[ "$_result" != "PASS" && "$_result" != "FAIL" ]]; then
  echo "error: expected PASS or FAIL but got an unexpected result: $_result" >&2
  exit 1
fi

echo "$_result"

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,21 @@
#!/bin/bash
# Run the eval tests for the spam detection AI model.
#
# This script must be run from the root directory of the repository.

set -euo pipefail

# Absolute path to the directory holding this script, used to locate the
# sibling prompt generator and eval prompt file.
SPAM_DIR="$(dirname "$(realpath "$0")")"

# Generate dynamic prompts for inference.
# Paths are quoted so a checkout located under a directory containing spaces
# (or glob characters) still resolves correctly.
_system_prompt="$("$SPAM_DIR/generate-sys-prompt.sh")"
_final_prompt="$(_value="$_system_prompt" yq eval '.messages[0].content = strenv(_value)' "$SPAM_DIR/eval-prompts.yml")"

# The following `gh models eval` command will fail after 20 requests due to rate limits.
# We are going to open up an issue in `github/gh-models` to address this.
#
# TODO: break up `eval-prompts.yml` file into smaller batches to avoid hitting the rate limit.
gh models eval <(printf '%s\n' "$_final_prompt")

View file

@ -0,0 +1,126 @@
#!/bin/bash
# Generate the system prompt for the spam detection AI model.
#
# The prompt is written to standard output.
#
# This script must be run from the root directory of the repository, because
# the issue templates are read through a path relative to the working directory.

set -euo pipefail

# Static part of the prompt.
# NOTE: inside a single-quoted string an apostrophe has to be written as '\''
# (close quote, escaped apostrophe, reopen quote). Writing '' merely closes and
# reopens the quote and silently drops the apostrophe from the text.
_system_prompt='
# Your role
You are a spam detection AI who helps identify spam issues submitted to the GitHub CLI repository.
Note that:
- More context about the GitHub CLI project is provided in section "Context" below.
- Criteria for spam issues are provided in section "Spam content indicators" below.
- Criteria for legitimate issues are provided in section "Legitimate content indicators" below.
With every prompt you are given the title and a body of a GitHub issue. Your task is to determine if the issue is spam
or not.
Prompts will be formatted as follows, where the title and body of an issue are surrounded by `<TITLE>` and `<BODY>` tags:
```
<TITLE>
[issue title goes here]
</TITLE>
<BODY>
[issue body goes here]
</BODY>
```
Your response must be single word `FAIL` if the issue looks like a spam, and `PASS` otherwise.
## Context
The GitHub CLI (also known as `gh`) project is a command-line tool for GitHub. It provides many commands to interact
with various GitHub features.
You can find the GitHub CLI tool documentation in the "GitHub CLI docs" section below, which helps you understand
the available commands and their usages.
## Legitimate content indicators
- Clear description of a bug with steps to reproduce.
- Feature requests with detailed explanations and use cases.
- Documentation improvements with specific suggestions.
- Questions about usage with context and examples.
- Reports that reference specific code, files, or functionality.
## Spam content indicators
Here are the common patterns of spam issues:
- A body that is a copy, or a small variation, of one of the issue templates defined under the "Issue templates" section below.
- When comparing with a template, you should ignore the headings and commented lines enclosed in `<!--`-`-->` tags, and
focus on the content.
- Unrelated body and title that do not provide any useful information about the issue.
- An empty issue body.
- A body that contains only a single word or a few words, such as "bug", "help", "issue", "problem".
- A meaningless body that does not provide any useful information about the issue.
- A body that is just one or more links without any context or explanation.
- Generic placeholder text like "Lorem ipsum" or "test test test".
- Repetitive content (same word/phrase repeated multiple times).
- Content that appears to be copied from other sources without relevance to the project.
- Promotional content, advertisements, or unrelated marketing material.
- Content in languages that seem inappropriate for the project context.
- Issues that don'\''t relate to the project'\''s purpose (e.g. personal messages, off-topic discussions).
- Content that seems like to be taken from, or quoting, another discussion or issue which does not establish a sensible
context, or problem statement, or feedback.
'
# Capture the root `gh` help text once, then add it to the prompt as its own
# section wrapped in `<GitHub CLI docs>` tags.
_gh_root_help="$(gh --help)"
_system_prompt+="
## GitHub CLI docs
The GitHub CLI tool has many commands, below is a piece of the help output, surrounded with \`<GitHub CLI docs>\` tags,
for the root \`gh\` command.
<GitHub CLI docs>
\`\`\`
${_gh_root_help}
\`\`\`
</GitHub CLI docs>
"

# Open the "Issue templates" section; the individual templates are appended
# after this introduction.
_system_prompt+="
## Issue templates
Here are the issue templates already defined in the project. The templates are surrounded with \`<Template N>\` tags and
triple backticks, where N is the template number. The templates are provided to help you understand the common patterns
of issues.
"
# Copy stdin to stdout with the YAML front matter removed.
#
# Only the FIRST pair of `---` lines (and everything between them) is dropped.
# Later `---` lines, e.g. Markdown horizontal rules in the template body, are
# kept, so they can no longer cause the rest of the template to be deleted.
_strip_front_matter() {
  awk '
    !closed && /^---$/ {
      if (inside) { inside = 0; closed = 1 } else { inside = 1 }
      next
    }
    !inside
  '
}

_template_index=1
for template_file in .github/ISSUE_TEMPLATE/*.md; do
  # An unmatched glob is left as the literal pattern; skip anything that is
  # not a regular file.
  if ! [[ -f "$template_file" ]]; then
    continue
  fi

  # Drop the front matter, then escape code fences that start a line so they
  # cannot terminate the triple-backtick wrapper added below.
  _escaped_template="$(_strip_front_matter < "$template_file" | sed -e 's/^```/\\```/g')"

  _system_prompt="${_system_prompt}
<Template ${_template_index}>
\`\`\`
${_escaped_template}
\`\`\`
</Template ${_template_index}>
"
  # Plain assignment instead of ((x++)): an arithmetic command returns a
  # non-zero status when its value is 0, which would trip `set -e`.
  _template_index=$((_template_index + 1))
done

printf '%s\n' "$_system_prompt"

View file

@ -0,0 +1,32 @@
#!/bin/bash
# Performs spam detection on an issue and labels it if it's spam.
#
# Usage: process-issue.sh <issue-url>
#
# Regardless of the spam detection result, the script always exits with a zero
# exit code, unless there's a runtime error.
#
# This script must be run from the root directory of the repository.

set -euo pipefail

# `${1:-}` keeps `set -u` from aborting with a bare "unbound variable" message
# when no argument is given, so the explicit check below can report the problem.
_issue_url="${1:-}"
if [[ -z "$_issue_url" ]]; then
  echo "error: issue URL is empty" >&2
  exit 1
fi

_suspected_spam_label="suspected-spam"
# Relative to the repository root, hence the working-directory requirement above.
_check_issue_script=".github/workflows/scripts/spam-detection/check-issue.sh"

# A non-zero exit from the check script aborts here because of `set -e`.
_result="$("$_check_issue_script" "$_issue_url")"

if [[ "$_result" == "PASS" ]]; then
  echo "detected as not-spam: $_issue_url"
  exit 0
fi

# Any result other than PASS is treated as spam.
echo "detected as spam: $_issue_url"
gh issue edit --add-label "$_suspected_spam_label" "$_issue_url"
echo "issue labelled as suspected spam"