Merge pull request #11316 from cli/babakks/automate-spam-issue-detection
Automate spam issue detection
This commit is contained in:
commit
b2348f8386
7 changed files with 5348 additions and 0 deletions
27
.github/workflows/detect-spam.yml
vendored
Normal file
27
.github/workflows/detect-spam.yml
vendored
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
name: Spam Issue Detection
# Runs the spam detection script against every newly opened issue.
on:
  issues:
    types: [opened]

# Permissions granted to the workflow token for this run.
permissions:
  contents: none
  issues: write
  models: read

jobs:
  issue-spam:
    runs-on: ubuntu-latest
    environment: cli-automation
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Run spam detection
        env:
          GH_TOKEN: ${{ secrets.AUTOMATION_TOKEN }}
          # Handed over through the environment and expanded by the shell as a
          # quoted variable, instead of being interpolated into the script text.
          ISSUE_URL: ${{ github.event.issue.html_url }}
        run: |
          # The default bash shell for `run` steps is started with `-e`, so a
          # failing command aborts the step immediately; a separate `$?` test
          # on the following line would never be reached. Test the command
          # itself so that the error message is actually printed.
          if ! ./.github/workflows/scripts/spam-detection/process-issue.sh "$ISSUE_URL"; then
            echo "error processing issue"
            exit 1
          fi
|
||||
7
.github/workflows/scripts/spam-detection/check-issue-prompts.yml
vendored
Normal file
7
.github/workflows/scripts/spam-detection/check-issue-prompts.yml
vendored
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
name: Detect spam
|
||||
model: openai/gpt-4o-mini
|
||||
messages:
|
||||
- role: system
|
||||
content: "" # Not a fixed value; it is generated and replaced at runtime
|
||||
- role: user
|
||||
content: "" # This will be replaced at runtime
|
||||
48
.github/workflows/scripts/spam-detection/check-issue.sh
vendored
Executable file
48
.github/workflows/scripts/spam-detection/check-issue.sh
vendored
Executable file
|
|
@ -0,0 +1,48 @@
|
|||
#!/bin/bash

# Check if an issue is spam or not and output "PASS" (not spam) or "FAIL" (spam).
#
# Usage: check-issue.sh <issue-url>
#
# Regardless of the spam detection result, the script always exits with a zero
# exit code, unless there's a runtime error.
#
# This script must be run from the root directory of the repository, because
# generate-sys-prompt.sh reads the issue templates through a relative path.

set -euo pipefail

# Absolute path of the directory holding this script, so that the sibling
# script and prompt file are located independently of how it was invoked.
SPAM_DIR="$(dirname "$(realpath "$0")")"

# Retrieve and prepare information about issue for detection.
# `${1:-}` keeps `set -u` from aborting with "unbound variable" before the
# explicit check below gets a chance to report the missing argument.
_issue_url="${1:-}"
if [[ -z "$_issue_url" ]]; then
  echo "error: issue URL is empty" >&2
  exit 1
fi

# Go template rendered by `gh issue view`; the tag layout matches the prompt
# format described in the system prompt.
_user_prompt_template='
<TITLE>
{{ .title }}
</TITLE>

<BODY>
{{ .body }}
</BODY>
'

_user_prompt="$(gh issue view --json title,body --template "$_user_prompt_template" "$_issue_url")"

# Generate dynamic prompts for inference
_system_prompt="$("$SPAM_DIR/generate-sys-prompt.sh")"

# The prompts are handed to yq through the environment (strenv) so that their
# content is never parsed as part of the yq expression.
_final_prompt="$(_system="$_system_prompt" _user="$_user_prompt" yq eval '.messages[0].content = strenv(_system) | .messages[1].content = strenv(_user)' "$SPAM_DIR/check-issue-prompts.yml")"

# Installer output is silenced, but a failure is reported explicitly instead of
# letting `set -e` end the script without any message.
if ! gh extension install github/gh-models 2>/dev/null; then
  echo "error: failed to install the github/gh-models extension" >&2
  exit 1
fi

# NOTE(review): `| cat` is kept from the original; stdout is already captured
# by the command substitution, so it looks redundant - confirm before removing.
_result="$(gh models run --file <(printf '%s\n' "$_final_prompt") | cat)"

if [[ "$_result" != "PASS" && "$_result" != "FAIL" ]]; then
  echo "error: expected PASS or FAIL but got an unexpected result: $_result" >&2
  exit 1
fi

printf '%s\n' "$_result"
|
||||
5087
.github/workflows/scripts/spam-detection/eval-prompts.yml
vendored
Normal file
5087
.github/workflows/scripts/spam-detection/eval-prompts.yml
vendored
Normal file
File diff suppressed because one or more lines are too long
21
.github/workflows/scripts/spam-detection/eval.sh
vendored
Executable file
21
.github/workflows/scripts/spam-detection/eval.sh
vendored
Executable file
|
|
@ -0,0 +1,21 @@
|
|||
#!/bin/bash

# Run the eval tests for the spam detection AI model.
#
# This script must be run from the root directory of the repository, because
# generate-sys-prompt.sh reads the issue templates through a relative path.

set -euo pipefail

# Absolute path of the directory holding this script, so that the sibling
# script and prompt file are located independently of how it was invoked.
SPAM_DIR="$(dirname "$(realpath "$0")")"

# Generate dynamic prompts for inference. The paths are quoted so that a
# checkout located under a directory containing spaces still works.
_system_prompt="$("$SPAM_DIR/generate-sys-prompt.sh")"

# The system prompt is handed to yq through the environment (strenv) so that
# its content is never parsed as part of the yq expression.
_final_prompt="$(_value="$_system_prompt" yq eval '.messages[0].content = strenv(_value)' "$SPAM_DIR/eval-prompts.yml")"

# The following `gh models eval` command will fail after 20 requests due to rate limits.
# We are going to open up an issue in `github/gh-models` to address this.
#
# TODO: break up `eval-prompts.yml` file into smaller batches to avoid hitting the rate limit.
gh models eval <(printf '%s\n' "$_final_prompt")
|
||||
126
.github/workflows/scripts/spam-detection/generate-sys-prompt.sh
vendored
Executable file
126
.github/workflows/scripts/spam-detection/generate-sys-prompt.sh
vendored
Executable file
|
|
@ -0,0 +1,126 @@
|
|||
#!/bin/bash

# Generate the system prompt for the spam detection AI model and print it to
# standard output.
#
# The prompt is assembled from three parts: a static instruction text, the
# output of `gh --help`, and the issue templates found under
# `.github/ISSUE_TEMPLATE/`.
#
# This script must be run from the root directory of the repository, because
# the issue templates are read through a relative path.

set -euo pipefail

# Static part of the prompt. Inside a single-quoted string a literal apostrophe
# has to be written as '\'' (two adjacent single quotes would just close and
# reopen the string, silently dropping the apostrophe).
_system_prompt='
# Your role

You are a spam detection AI who helps identify spam issues submitted to the GitHub CLI repository.

Note that:
- More context about the GitHub CLI project is provided in section "Context" below.
- Criteria for spam issues are provided in section "Spam content indicators" below.
- Criteria for legitimate issues are provided in section "Legitimate content indicators" below.

With every prompt you are given the title and a body of a GitHub issue. Your task is to determine if the issue is spam
or not.

Prompts will be formatted as follows, where the title and body of an issue are surrounded by `<TITLE>` and `<BODY>` tags:

```
<TITLE>
[issue title goes here]
</TITLE>

<BODY>
[issue body goes here]
</BODY>
```

Your response must be single word `FAIL` if the issue looks like a spam, and `PASS` otherwise.

## Context

The GitHub CLI (also known as `gh`) project is a command-line tool for GitHub. It provides many commands to interact
with various GitHub features.

You can find the GitHub CLI tool documentation in the "GitHub CLI docs" section below, which helps you understand
the available commands and their usages.

## Legitimate content indicators

- Clear description of a bug with steps to reproduce.
- Feature requests with detailed explanations and use cases.
- Documentation improvements with specific suggestions.
- Questions about usage with context and examples.
- Reports that reference specific code, files, or functionality.

## Spam content indicators

Here are the common patterns of spam issues:

- A body that is a copy, or a small variation, of one of the issue templates defined under the "Issue templates" section below.
- When comparing with a template, you should ignore the headings and commented lines enclosed in `<!--`-`-->` tags, and
focus on the content.
- Unrelated body and title that do not provide any useful information about the issue.
- An empty issue body.
- A body that contains only a single word or a few words, such as "bug", "help", "issue", "problem".
- A meaningless body that does not provide any useful information about the issue.
- A body that is just one or more links without any context or explanation.
- Generic placeholder text like "Lorem ipsum" or "test test test".
- Repetitive content (same word/phrase repeated multiple times).
- Content that appears to be copied from other sources without relevance to the project.
- Promotional content, advertisements, or unrelated marketing material.
- Content in languages that seem inappropriate for the project context.
- Issues that don'\''t relate to the project'\''s purpose (e.g. personal messages, off-topic discussions).
- Content that seems like to be taken from, or quoting, another discussion or issue which does not establish a sensible
context, or problem statement, or feedback.

'

# Append the help output for the root `gh` command
_system_prompt="${_system_prompt}

## GitHub CLI docs

The GitHub CLI tool has many commands, below is a piece of the help output, surrounded with \`<GitHub CLI docs>\` tags,
for the root \`gh\` command.

<GitHub CLI docs>
\`\`\`
$(gh --help)
\`\`\`
</GitHub CLI docs>
"

# Append the issue templates to the system prompt.
_system_prompt="${_system_prompt}

## Issue templates

Here are the issue templates already defined in the project. The templates are surrounded with \`<Template N>\` tags and
triple backticks, where N is the template number. The templates are provided to help you understand the common patterns
of issues.

"

_template_index=1
for template_file in .github/ISSUE_TEMPLATE/*.md; do
  # Also covers the case where the glob matched nothing and was left as the
  # literal pattern.
  if ! [[ -f "$template_file" ]]; then
    continue
  fi

  # Remove the YAML front matter, i.e. the block delimited by `---` lines that
  # starts on the very first line of the file. Only that leading block is
  # dropped; `---` lines further down (e.g. horizontal rules) are kept as
  # regular template content.
  _template_content="$(awk '
    NR == 1 && $0 == "---" { in_front_matter = 1; next }
    in_front_matter { if ($0 == "---") in_front_matter = 0; next }
    { print }
  ' "$template_file")"

  # Escape code fences at the start of a line so that they cannot terminate
  # the fence wrapped around the template below.
  _escaped_template="$(sed -e 's/^```/\\```/g' <<< "$_template_content")"

  _system_prompt="${_system_prompt}

<Template ${_template_index}>

\`\`\`
${_escaped_template}
\`\`\`
</Template ${_template_index}>
"

  # Plain assignment instead of `((_template_index++))`: the exit status of an
  # arithmetic command depends on the value of the expression, which is a trap
  # under `set -e`.
  _template_index=$((_template_index + 1))
done

printf '%s\n' "$_system_prompt"
|
||||
32
.github/workflows/scripts/spam-detection/process-issue.sh
vendored
Executable file
32
.github/workflows/scripts/spam-detection/process-issue.sh
vendored
Executable file
|
|
@ -0,0 +1,32 @@
|
|||
#!/bin/bash

# Performs spam detection on an issue and labels it if it's spam.
#
# Usage: process-issue.sh <issue-url>
#
# Regardless of the spam detection result, the script always exits with a zero
# exit code, unless there's a runtime error.
#
# This script must be run from the root directory of the repository.

set -euo pipefail

# `${1:-}` keeps `set -u` from aborting with "unbound variable" before the
# explicit check below gets a chance to report the missing argument.
_issue_url="${1:-}"
if [[ -z "$_issue_url" ]]; then
  echo "error: issue URL is empty" >&2
  exit 1
fi

_suspected_spam_label="suspected-spam"

# Locate the checker next to this script, the same way the sibling scripts
# locate their companion files, instead of relying on a cwd-relative path.
_check_issue_script="$(dirname "$(realpath "$0")")/check-issue.sh"

# With `set -e`, a failing checker ends this script here with its exit status.
_result="$("$_check_issue_script" "$_issue_url")"

case "$_result" in
  PASS)
    echo "detected as not-spam: $_issue_url"
    exit 0
    ;;
  FAIL)
    echo "detected as spam: $_issue_url"
    ;;
  *)
    # Never label an issue on the basis of output that is not an explicit
    # verdict.
    echo "error: expected PASS or FAIL but got an unexpected result: $_result" >&2
    exit 1
    ;;
esac

gh issue edit --add-label "$_suspected_spam_label" "$_issue_url"

echo "issue labelled as suspected spam"
|
||||
Loading…
Add table
Add a link
Reference in a new issue