ci: add spam issue detection scripts

Signed-off-by: Babak K. Shandiz <babakks@github.com>
2025-07-16 21:01:00 +01:00 · 2025-07-16 21:01:00 +01:00 · c7c68920d8
commit c7c68920d8
parent dbff7c5655
6 changed files with 5338 additions and 0 deletions
--- a/.github/workflows/scripts/spam-detection/check-issue.sh
+++ b/.github/workflows/scripts/spam-detection/check-issue.sh
@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Check if an issue is spam or not and output "PASS" (not spam) or "FAIL" (spam).
+#
+# Regardless of the spam detection result, the script always exits with a zero
+# exit code, unless there's a runtime error.
+#
+# This script must be run from the root directory of the repository.
+
+set -euo pipefail
+
+_prompt_file=".github/workflows/scripts/spam-detection/prompt.yml"
+_generate_sys_prompt_script=".github/workflows/scripts/spam-detection/generate-sys-prompt.sh"
+_generate_prompt_script=".github/workflows/scripts/spam-detection/generate-prompt.sh"
+
+# `${1:-}` keeps `set -u` from aborting before the friendlier error below.
+_issue_url="${1:-}"
+if [[ -z "$_issue_url" ]]; then
+    echo "error: issue URL is empty" >&2
+    exit 1
+fi
+
+_issue="$(gh issue view --json title,body "$_issue_url")"
+_issue_title="$(jq -r ".title" <<< "$_issue")"
+_issue_body="$(jq -r ".body" <<< "$_issue")"
+
+_system_prompt="$("$_generate_sys_prompt_script")"
+_input_prompt="$("$_generate_prompt_script" "$_issue_title" "$_issue_body")"
+
+_updated_prompt_file_content="$(
+    yq eval 'del(.testData, .evaluators)' "$_prompt_file" | # drop test data
+    _system="$_system_prompt" _input="$_input_prompt" yq eval ".messages[0].content = strenv(_system) | .messages[1].content = strenv(_input)"
+)"
+
+# stderr is discarded here; with `set -e`, a failed install stops the script.
+gh extension install github/gh-models 2>/dev/null
+
+_result="$(gh models run --file <(echo "$_updated_prompt_file_content") | cat)"
+
+if [[ "$_result" != "PASS" && "$_result" != "FAIL" ]]; then
+    echo "error: expected PASS or FAIL but got an unexpected result: $_result" >&2
+    exit 1
+fi
+
+echo "$_result"
--- a/.github/workflows/scripts/spam-detection/eval.sh
+++ b/.github/workflows/scripts/spam-detection/eval.sh
@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Run the evals of the spam detection AI model, using the prompt file with the
+# freshly generated system prompt injected into it.
+#
+# This script must be run from the root directory of the repository.
+
+set -euo pipefail
+
+_spam_detection_dir=".github/workflows/scripts/spam-detection"
+
+# Replace the content of the first message with the generated system prompt.
+_sys_prompt="$("$_spam_detection_dir/generate-sys-prompt.sh")"
+_eval_prompt="$(
+    _system="$_sys_prompt" yq eval '.messages[0].content = strenv(_system)' "$_spam_detection_dir/prompt.yml"
+)"
+
+# Ideally this would simply be:
+#
+#     gh models eval <(echo "$_eval_prompt")
+#
+# However, `gh-models` does not throttle its API requests, so a patched build of
+# the extension (with a deliberate delay between runs) is used instead. That
+# binary is assumed to be in the root directory of the repository, and it is
+# invoked directly (not through `gh`).
+./gh-models eval <(echo "$_eval_prompt")
--- a/.github/workflows/scripts/spam-detection/generate-prompt.sh
+++ b/.github/workflows/scripts/spam-detection/generate-prompt.sh
@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Print the user prompt for the spam detection AI model: the issue title and
+# body, each wrapped in its own tag pair.
+#
+# Arguments:
+#   $1 - issue title
+#   $2 - issue body
+#
+# This script must be run from the root directory of the repository.
+
+set -euo pipefail
+
+_title="$1"
+_body="$2"
+
+# Layout: a blank line, the <TITLE> section, a blank line, the <BODY> section,
+# and a trailing blank line.
+printf '\n'
+printf '<TITLE>\n%s\n</TITLE>\n' "$_title"
+printf '\n'
+printf '<BODY>\n%s\n</BODY>\n' "$_body"
+printf '\n'
--- a/.github/workflows/scripts/spam-detection/generate-sys-prompt.sh
+++ b/.github/workflows/scripts/spam-detection/generate-sys-prompt.sh
@ -0,0 +1,126 @@
+#!/bin/bash
+
+# Generate the system prompt for the spam detection AI model.
+# This script must be run from the root directory of the repository.
+
+set -euo pipefail
+
+# The prompt text is single-quoted: a literal apostrophe must be written as '\''.
+_system_prompt='
+# Your role
+
+You are a spam detection AI who helps identify spam issues submitted to the GitHub CLI repository.
+
+Note that:
+- More context about the GitHub CLI project is provided in section "Context" below.
+- Criteria for spam issues are provided in section "Spam content indicators" below.
+- Criteria for legitimate issues are provided in section "Legitimate content indicators" below.
+
+With every prompt you are given the title and a body of a GitHub issue. Your task is to determine if the issue is spam
+or not.
+
+Prompts will be formatted as follows, where the title and body of an issue are surrounded by `<TITLE>` and `<BODY>` tags:
+
+```
+<TITLE>
+[issue title goes here]
+</TITLE>
+
+<BODY>
+[issue body goes here]
+</BODY>
+```
+
+Your response must be single word `FAIL` if the issue looks like a spam, and `PASS` otherwise.
+
+## Context
+
+The GitHub CLI (also known as `gh`) project is a command-line tool for GitHub. It provides many commands to interact
+with various GitHub features.
+
+You can find the general docs of the GitHub CLI tool in section "GitHub CLI docs" below, which helps you understand
+the available commands and their usages.
+
+## Legitimate content indicators
+
+- Clear description of a bug with steps to reproduce.
+- Feature requests with detailed explanations and use cases.
+- Documentation improvements with specific suggestions.
+- Questions about usage with context and examples.
+- Reports that reference specific code, files, or functionality.
+
+## Spam content indicators
+
+Here are the common patterns of spam issues:
+
+- A body that is a copy, or a small variation, of one of the issue templates defined under the "Issue templates" section below.
+  - When comparing with a template, you should ignore the headings and commented lines enclosed in `<!--`-`-->` tags, and
+    focus on the content.
+- Unrelated body and title that do not provide any useful information about the issue.
+- An empty issue body.
+- A body that contains only a single word or a few words, such as "bug", "help", "issue", "problem".
+- A meaningless body that does not provide any useful information about the issue.
+- A body that is just one or more links without any context or explanation.
+- Generic placeholder text like "Lorem ipsum" or "test test test".
+- Repetitive content (same word/phrase repeated multiple times).
+- Content that appears to be copied from other sources without relevance to the project.
+- Promotional content, advertisements, or unrelated marketing material.
+- Content in languages that seem inappropriate for the project context.
+- Issues that don'\''t relate to the project'\''s purpose (e.g. personal messages, off-topic discussions).
+- Content that seems like to be taken from, or quoting, another discussion or issue which does not establish a sensible
+  context, or problem statement, or feedback.
+
+'
+
+# Add the help output of the root `gh` command as reference docs for the model.
+_system_prompt+="
+
+## GitHub CLI docs
+
+The GitHub CLI tool has many commands, below is a piece of the help output, surrounded with \`<GitHub CLI docs>\` tags,
+for the root \`gh\` command.
+
+<GitHub CLI docs>
+\`\`\`
+$(gh --help)
+\`\`\`
+</GitHub CLI docs>
+"
+
+# Add the heading and introduction of the issue templates section.
+_system_prompt+="
+
+## Issue templates
+
+Here are the issue templates already defined in the project. The templates are surrounded with \`<Template N>\` tags and
+triple backticks, where N is the template number. The templates are provided to help you understand the common patterns
+of issues.
+
+"
+
+# Append every issue template, numbered from 1, in its own <Template N> section.
+_template_index=1
+for template_file in .github/ISSUE_TEMPLATE/*.md; do
+    [[ -f "$template_file" ]] || continue # e.g. the unexpanded glob pattern
+    # Drop only a leading YAML front matter block; keep any later `---` lines.
+    _template_content="$(awk '
+        NR == 1 && $0 == "---" { in_front_matter = 1; next }
+        in_front_matter { if ($0 == "---") in_front_matter = 0; next }
+        { print }
+    ' "$template_file")"
+    _escaped_template="$(sed -e 's/^```/\\```/g' <<< "$_template_content")"
+
+    _system_prompt="${_system_prompt}
+
+<Template ${_template_index}>
+
+\`\`\`
+${_escaped_template}
+\`\`\`
+</Template ${_template_index}>
+"
+
+    _template_index=$((_template_index + 1)) # an assignment never trips `set -e`
+done
+
+echo "$_system_prompt"
--- a/.github/workflows/scripts/spam-detection/process-issue.sh
+++ b/.github/workflows/scripts/spam-detection/process-issue.sh
@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Performs spam detection on an issue and labels it if it's spam.
+#
+# Regardless of the spam detection result, the script always exits with a zero
+# exit code, unless there's a runtime error.
+#
+# This script must be run from the root directory of the repository.
+
+set -euo pipefail
+
+# `${1:-}` keeps `set -u` from aborting before the friendlier error below.
+_issue_url="${1:-}"
+if [[ -z "$_issue_url" ]]; then
+    echo "error: issue URL is empty" >&2
+    exit 1
+fi
+
+_suspected_spam_label="suspected-spam"
+_check_issue_script=".github/workflows/scripts/spam-detection/check-issue.sh"
+
+# The check script prints PASS or FAIL; if it fails, `set -e` stops this script.
+_result="$("$_check_issue_script" "$_issue_url")"
+if [[ "$_result" == "PASS" ]]; then
+    exit 0
+fi
+
+echo "spam issue detected: $_issue_url"
+gh issue edit --add-label "$_suspected_spam_label" "$_issue_url"
+
+echo "issue labelled as suspected spam"
--- a/.github/workflows/scripts/spam-detection/prompt.yml
+++ b/.github/workflows/scripts/spam-detection/prompt.yml