From c7c68920d81871a39584ee2642e10af64fae2572 Mon Sep 17 00:00:00 2001 From: "Babak K. Shandiz" Date: Wed, 16 Jul 2025 21:01:00 +0100 Subject: [PATCH 01/14] ci: add spam issue detection scripts Signed-off-by: Babak K. Shandiz --- .../scripts/spam-detection/check-issue.sh | 45 + .../workflows/scripts/spam-detection/eval.sh | 26 + .../scripts/spam-detection/generate-prompt.sh | 23 + .../spam-detection/generate-sys-prompt.sh | 126 + .../scripts/spam-detection/process-issue.sh | 31 + .../scripts/spam-detection/prompt.yml | 5087 +++++++++++++++++ 6 files changed, 5338 insertions(+) create mode 100755 .github/workflows/scripts/spam-detection/check-issue.sh create mode 100755 .github/workflows/scripts/spam-detection/eval.sh create mode 100755 .github/workflows/scripts/spam-detection/generate-prompt.sh create mode 100755 .github/workflows/scripts/spam-detection/generate-sys-prompt.sh create mode 100755 .github/workflows/scripts/spam-detection/process-issue.sh create mode 100644 .github/workflows/scripts/spam-detection/prompt.yml diff --git a/.github/workflows/scripts/spam-detection/check-issue.sh b/.github/workflows/scripts/spam-detection/check-issue.sh new file mode 100755 index 000000000..d2c60d8fe --- /dev/null +++ b/.github/workflows/scripts/spam-detection/check-issue.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# Check if an issue is spam or not and output "PASS" (not spam) or "FAIL" (spam). +# +# Regardless of the spam detection result, the script always exits with a zero +# exit code, unless there's a runtime error. +# +# This script must be run from the root directory of the repository. 
+
+set -euo pipefail
+
+_prompt_file=".github/workflows/scripts/spam-detection/prompt.yml"
+_generate_sys_prompt_script=".github/workflows/scripts/spam-detection/generate-sys-prompt.sh"
+_generate_prompt_script=".github/workflows/scripts/spam-detection/generate-prompt.sh"
+
+# Default $1 to empty: under `set -u` a bare "$1" aborts before the check below.
+_issue_url="${1:-}"
+if [[ -z "$_issue_url" ]]; then
+    echo "error: issue URL is empty" >&2
+    exit 1
+fi
+
+_issue="$(gh issue view --json title,body "$_issue_url")"
+
+_issue_body="$(jq -r ".body" <<< "$_issue")"
+_issue_title="$(jq -r ".title" <<< "$_issue")"
+
+_system_prompt="$("$_generate_sys_prompt_script")"
+_input_prompt="$("$_generate_prompt_script" "$_issue_title" "$_issue_body")"
+# Strip the eval-only keys, then set messages[0] and messages[1] to the generated prompts.
+_updated_prompt_file_content="$(
+    yq eval 'del(.testData, .evaluators)' "$_prompt_file" | # drop test data
+    _system="$_system_prompt" _input="$_input_prompt" yq eval ".messages[0].content = strenv(_system) | .messages[1].content = strenv(_input)"
+)"
+# Install the gh-models extension used below; its stderr output is discarded.
+gh extension install github/gh-models 2>/dev/null
+
+_result="$(gh models run --file <(echo "$_updated_prompt_file_content") | cat)"
+
+if [[ "$_result" != "PASS" && "$_result" != "FAIL" ]]; then
+    echo "error: expected PASS or FAIL but got an unexpected result: $_result" >&2
+    exit 1
+fi
+
+echo "$_result"
diff --git a/.github/workflows/scripts/spam-detection/eval.sh b/.github/workflows/scripts/spam-detection/eval.sh
new file mode 100755
index 000000000..f4a5dc0f7
--- /dev/null
+++ b/.github/workflows/scripts/spam-detection/eval.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Run the eval tests for the spam detection AI model.
+#
+# This script must be run from the root directory of the repository.
+
+set -euo pipefail
+
+_prompt_file=".github/workflows/scripts/spam-detection/prompt.yml"
+_generate_sys_prompt_script=".github/workflows/scripts/spam-detection/generate-sys-prompt.sh"
+
+_system_prompt="$("$_generate_sys_prompt_script")"
+_updated_prompt_file="$(_value="$_system_prompt" yq eval '.messages[0].content = strenv(_value)' "$_prompt_file")"
+
+# We should be able to just run the following command:
+#
+# ```
+# gh models eval <(echo "$_updated_prompt_file")
+# ```
+#
+# But since `gh-models` does not throttle the rate of API requests, we need to
+# modify the extension code and introduce a deliberate delay between the runs.
+# Here, we assume a binary of the `gh-models` extension (with appropriate
+# throttling) is available in the root directory of the repository and we're
+# calling it directly (not through `gh`).
+./gh-models eval <(echo "$_updated_prompt_file")
diff --git a/.github/workflows/scripts/spam-detection/generate-prompt.sh b/.github/workflows/scripts/spam-detection/generate-prompt.sh
new file mode 100755
index 000000000..9437bf751
--- /dev/null
+++ b/.github/workflows/scripts/spam-detection/generate-prompt.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Generate the prompt for the spam detection AI model. The issue title ($1) and
+# body ($2) are wrapped in <TITLE> and <BODY> tags, as the system prompt describes.
+#
+# This script must be run from the root directory of the repository.
+
+set -euo pipefail
+
+_issue_title="${1?error: issue title argument is missing}"
+_issue_body="${2?error: issue body argument is missing}"
+
+_prompt="
+<TITLE>
+$_issue_title
+</TITLE>
+
+<BODY>
+$_issue_body
+</BODY>
+"
+
+printf '%s\n' "$_prompt"
diff --git a/.github/workflows/scripts/spam-detection/generate-sys-prompt.sh b/.github/workflows/scripts/spam-detection/generate-sys-prompt.sh
new file mode 100755
index 000000000..76c104946
--- /dev/null
+++ b/.github/workflows/scripts/spam-detection/generate-sys-prompt.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+
+# Generate the system prompt for the spam detection AI model.
+#
+# This script must be run from the root directory of the repository.
+ +set -euo pipefail + +_system_prompt=' +# Your role + +You are a spam detection AI who helps identify spam issues submitted to the GitHub CLI repository. + +Note that: +- More context about the GitHub CLI project is provided in section "Context" below. +- Criteria for spam issues are provided in section "Spam content indicators" below. +- Criteria for legitimate issues are provided in section "Legitimate content indicators" below. + +With every prompt you are given the title and a body of a GitHub issue. Your task is to determine if the issue is spam +or not. + +Prompts will be formatted as follows, where the title and body of an issue are surrounded by `<TITLE>` and `<BODY>` tags: + +``` +<TITLE> +[issue title goes here] +</TITLE> + +<BODY> +[issue body goes here] +</BODY> +``` + +Your response must be single word `FAIL` if the issue looks like a spam, and `PASS` otherwise. + +## Context + +The GitHub CLI (also known as `gh`) project is a command-line tool for GitHub. It provides many commands to interact +with various GitHub features. + +You can find the general docs of the GitHub CLI tool in section "GitHub CLI docs" below, which helps you understand +the available commands and their usages. + +## Legitimate content indicators + +- Clear description of a bug with steps to reproduce. +- Feature requests with detailed explanations and use cases. +- Documentation improvements with specific suggestions. +- Questions about usage with context and examples. +- Reports that reference specific code, files, or functionality. + +## Spam content indicators + +Here are the common patterns of spam issues: + +- A body that is a copy, or a small variation, of one of the issue templates defined under the "Issue templates" section below. + - When comparing with a template, you should ignore the headings and commented lines enclosed in `<!-- -->` tags, and + focus on the content. +- Unrelated body and title that do not provide any useful information about the issue. +- An empty issue body. 
+- A body that contains only a single word or a few words, such as "bug", "help", "issue", "problem". +- A meaningless body that does not provide any useful information about the issue. +- A body that is just one or more links without any context or explanation. +- Generic placeholder text like "Lorem ipsum" or "test test test". +- Repetitive content (same word/phrase repeated multiple times). +- Content that appears to be copied from other sources without relevance to the project. +- Promotional content, advertisements, or unrelated marketing material. +- Content in languages that seem inappropriate for the project context. +- Issues that don''t relate to the project''s purpose (e.g. personal messages, off-topic discussions). +- Content that seems like to be taken from, or quoting, another discussion or issue which does not not establish a sensible + context, or problem statement, or feedback. + +' + +# Append the help output for the root `gh` command +_system_prompt="${_system_prompt} + +## GitHub CLI docs + +The GitHub CLI tool has many commands, below is a piece of the help output, surrounded with \`\` tags, +for the root \`gh\` command. + + +\`\`\` +$(gh --help) +\`\`\` + +" + +# Append the issue templates to the system prompt. +_system_prompt="${_system_prompt} + +## Issue templates + +Here are the issue templates already defined in the project. The templates are surrounded with \`