ci: add spam issue detection scripts

Signed-off-by: Babak K. Shandiz <babakks@github.com>
This commit is contained in:
Babak K. Shandiz 2025-07-16 21:01:00 +01:00
parent dbff7c5655
commit c7c68920d8
No known key found for this signature in database
GPG key ID: 9472CAEFF56C742E
6 changed files with 5338 additions and 0 deletions

View file

@ -0,0 +1,45 @@
#!/bin/bash
# Check if an issue is spam or not and output "PASS" (not spam) or "FAIL" (spam).
#
# Usage: check-issue.sh <issue-url>
#
# Regardless of the spam detection result, the script always exits with a zero
# exit code, unless there's a runtime error (including a model response that is
# neither "PASS" nor "FAIL").
#
# This script must be run from the root directory of the repository.

set -euo pipefail

_prompt_file=".github/workflows/scripts/spam-detection/prompt.yml"
_generate_sys_prompt_script=".github/workflows/scripts/spam-detection/generate-sys-prompt.sh"
_generate_prompt_script=".github/workflows/scripts/spam-detection/generate-prompt.sh"

# Default to an empty string so that, under `set -u`, a missing argument reaches
# the explicit check below instead of aborting with "unbound variable".
_issue_url="${1:-}"
if [[ -z "$_issue_url" ]]; then
  echo "error: issue URL is empty" >&2
  exit 1
fi

# Fetch the issue once and extract the fields we need from the JSON payload.
_issue="$(gh issue view --json title,body "$_issue_url")"
_issue_body="$(jq -r ".body" <<< "$_issue")"
_issue_title="$(jq -r ".title" <<< "$_issue")"

_system_prompt="$("$_generate_sys_prompt_script")"
_input_prompt="$("$_generate_prompt_script" "$_issue_title" "$_issue_body")"

# Build the prompt file to run: drop the test data and evaluators, then set the
# content of the first and second messages to the system prompt and the issue
# prompt. The values are passed through the environment (`strenv`) so they are
# never interpolated into the yq expression.
_updated_prompt_file_content="$(
  yq eval 'del(.testData, .evaluators)' "$_prompt_file" |
    _system="$_system_prompt" _input="$_input_prompt" \
      yq eval '.messages[0].content = strenv(_system) | .messages[1].content = strenv(_input)'
)"

# NOTE(review): stderr is discarded here, so if the installation fails the
# script exits (because of `set -e`) without any diagnostics.
gh extension install github/gh-models 2>/dev/null

# NOTE(review): the trailing `| cat` presumably forces plain, non-interactive
# output from the extension; confirm before removing it.
_result="$(gh models run --file <(printf '%s\n' "$_updated_prompt_file_content") | cat)"

if [[ "$_result" != "PASS" && "$_result" != "FAIL" ]]; then
  echo "error: expected PASS or FAIL but got an unexpected result: $_result" >&2
  exit 1
fi

echo "$_result"

View file

@ -0,0 +1,26 @@
#!/bin/bash
# Evaluate the spam detection prompt against the test cases in the prompt file.
#
# This script must be run from the root directory of the repository.

set -euo pipefail

_sys_prompt_generator=".github/workflows/scripts/spam-detection/generate-sys-prompt.sh"
_prompt_yaml=".github/workflows/scripts/spam-detection/prompt.yml"

# Generate the system prompt and set it as the content of the first message of
# the prompt file. The value is passed through the environment (`strenv`) so it
# is never interpolated into the yq expression.
_sys_prompt="$("$_sys_prompt_generator")"
_eval_prompt="$(
  _value="$_sys_prompt" yq eval '.messages[0].content = strenv(_value)' "$_prompt_yaml"
)"

# Ideally, this would simply be:
#
# ```
# gh models eval <(echo "$_eval_prompt")
# ```
#
# However, `gh-models` does not throttle the rate of its API requests, so the
# extension code has to be modified to introduce a deliberate delay between the
# runs. We therefore assume that a binary of the `gh-models` extension (with
# appropriate throttling) is available in the root directory of the repository,
# and call it directly (not through `gh`).
./gh-models eval <(echo "$_eval_prompt")

View file

@ -0,0 +1,23 @@
#!/bin/bash
# Print the user prompt for the spam detection AI model.
#
# Arguments:
#   $1 - issue title
#   $2 - issue body
#
# The title and the body are wrapped in `<TITLE>` and `<BODY>` tags, each tag
# and each value on its own line, with a blank line before and after.
#
# This script must be run from the root directory of the repository.

set -euo pipefail

_title="$1"
_body="$2"

# NOTE(review): the title and body are inserted verbatim, so an issue that
# itself contains `</BODY>` or `</TITLE>` is not escaped in any way.
printf '\n<TITLE>\n%s\n</TITLE>\n<BODY>\n%s\n</BODY>\n\n' "$_title" "$_body"

View file

@ -0,0 +1,126 @@
#!/bin/bash
# Generate the system prompt for the spam detection AI model and print it to
# stdout.
#
# The prompt is made of three parts:
#   1. A static section (role, context, legitimate/spam indicators).
#   2. The help output of the root `gh` command.
#   3. The issue templates found under `.github/ISSUE_TEMPLATE/*.md`, with their
#      YAML front matter removed.
#
# This script must be run from the root directory of the repository.

set -euo pipefail

# Static part of the prompt. It is a single-quoted string, so apostrophes in the
# text must be written as '\'' (a doubled '' is NOT an escape in Bash; it just
# closes and reopens the string and silently drops the apostrophe).
#
# NOTE(review): apart from the apostrophes, the wording is kept verbatim,
# including a few typos (e.g. "does not not establish"); confirm before
# rewording since the text is sent to the model as is.
_system_prompt='
# Your role
You are a spam detection AI who helps identify spam issues submitted to the GitHub CLI repository.
Note that:
- More context about the GitHub CLI project is provided in section "Context" below.
- Criteria for spam issues are provided in section "Spam content indicators" below.
- Criteria for legitimate issues are provided in section "Legitimate content indicators" below.
With every prompt you are given the title and a body of a GitHub issue. Your task is to determine if the issue is spam
or not.
Prompts will be formatted as follows, where the title and body of an issue are surrounded by `<TITLE>` and `<BODY>` tags:
```
<TITLE>
[issue title goes here]
</TITLE>
<BODY>
[issue body goes here]
</BODY>
```
Your response must be single word `FAIL` if the issue looks like a spam, and `PASS` otherwise.
## Context
The GitHub CLI (also known as `gh`) project is a command-line tool for GitHub. It provides many commands to interact
with various GitHub features.
You can find a the general docs of the GitHub CLI tool in section "GitHub CLI docs" below, which helps you understand
the available commands and their usages.
## Legitimate content indicators
- Clear description of a bug with steps to reproduce.
- Feature requests with detailed explanations and use cases.
- Documentation improvements with specific suggestions.
- Questions about usage with context and examples.
- Reports that reference specific code, files, or functionality.
## Spam content indicators
Here are the common patterns of spam issues:
- A body that is a copy, or a small variation, of one of the issue templates defined under the "Issue templates" section below.
- When comparing with a template, you should ignore the headings and commented lines enclosed in `<!--`-`-->` tags, and
focus on the content.
- Unrelated body and title that do not provide any useful information about the issue.
- An empty issue body.
- A body that contains only a single word or a few words, such as "bug", "help", "issue", "problem".
- A meaningless body that does not provide any useful information about the issue.
- A body that is just one or more links without any context or explanation.
- Generic placeholder text like "Lorem ipsum" or "test test test".
- Repetitive content (same word/phrase repeated multiple times).
- Content that appears to be copied from other sources without relevance to the project.
- Promotional content, advertisements, or unrelated marketing material.
- Content in languages that seem inappropriate for the project context.
- Issues that don'\''t relate to the project'\''s purpose (e.g. personal messages, off-topic discussions).
- Content that seems like to be taken from, or quoting, another discussion or issue which does not not establish a sensible
context, or problem statement, or feedback.
'

# Append the help output for the root `gh` command
_system_prompt="${_system_prompt}
## GitHub CLI docs
The GitHub CLI tool has many commands, below is a piece of the help output, surrounded with \`<GitHub CLI docs>\` tags,
for the root \`gh\` command.
<GitHub CLI docs>
\`\`\`
$(gh --help)
\`\`\`
</GitHub CLI docs>
"

# Append the issue templates to the system prompt.
_system_prompt="${_system_prompt}
## Issue templates
Here are the issue templates already defined in the project. The templates are surrounded with \`<Template N>\` tags and
triple backticks, where N is the template number. The templates are provided to help you understand the common patterns
of issues.
"

_template_index=1
for template_file in .github/ISSUE_TEMPLATE/*.md; do
  # Without `nullglob`, an unmatched glob stays as the literal pattern; skip
  # anything that is not a regular file.
  if ! [[ -f "$template_file" ]]; then
    continue
  fi

  # Remove the YAML front matter: only when the very first line is `---`, drop
  # everything up to and including the next `---` line. Any later `---` line
  # (e.g. a Markdown horizontal rule) is kept, and a template without front
  # matter is passed through untouched.
  _template_content="$(
    awk '
      NR == 1 && /^---$/ { in_front_matter = 1; next }
      in_front_matter && /^---$/ { in_front_matter = 0; next }
      !in_front_matter
    ' "$template_file"
  )"

  # Escape code fences at the beginning of a line so that they do not close the
  # triple-backtick block the template is wrapped in below.
  _escaped_template="$(sed -e 's/^```/\\```/' <<< "$_template_content")"

  _system_prompt="${_system_prompt}
<Template ${_template_index}>
\`\`\`
${_escaped_template}
\`\`\`
</Template ${_template_index}>
"

  # Plain assignment rather than `((_template_index++))`: the exit status of
  # `(( ))` depends on the value of the expression, which is fragile under
  # `set -e`.
  _template_index=$((_template_index + 1))
done

echo "$_system_prompt"

View file

@ -0,0 +1,31 @@
#!/bin/bash
# Performs spam detection on an issue and labels it if it's spam.
#
# Usage: process-issue.sh <issue-url>
#
# Regardless of the spam detection result, the script always exits with a zero
# exit code, unless there's a runtime error.
#
# This script must be run from the root directory of the repository.

set -euo pipefail

# Default to an empty string so that, under `set -u`, a missing argument reaches
# the explicit check below instead of aborting with "unbound variable".
_issue_url="${1:-}"
if [[ -z "$_issue_url" ]]; then
  echo "error: issue URL is empty" >&2
  exit 1
fi

_suspected_spam_label="suspected-spam"
_check_issue_script=".github/workflows/scripts/spam-detection/check-issue.sh"

# If the check script fails, this assignment fails too and `set -e` aborts the
# script before any label is applied.
_result="$("$_check_issue_script" "$_issue_url")"

# Nothing to do for issues that are not considered spam.
if [[ "$_result" == "PASS" ]]; then
  exit 0
fi

echo "spam issue detected: $_issue_url"
gh issue edit --add-label "$_suspected_spam_label" "$_issue_url"
echo "issue labelled as suspected spam"

File diff suppressed because one or more lines are too long