ci: add spam issue detection scripts
Signed-off-by: Babak K. Shandiz <babakks@github.com>
This commit is contained in:
parent
dbff7c5655
commit
c7c68920d8
6 changed files with 5338 additions and 0 deletions
45
.github/workflows/scripts/spam-detection/check-issue.sh
vendored
Executable file
45
.github/workflows/scripts/spam-detection/check-issue.sh
vendored
Executable file
|
|
@ -0,0 +1,45 @@
|
|||
#!/bin/bash

# Check if an issue is spam or not and output "PASS" (not spam) or "FAIL" (spam).
#
# Usage: check-issue.sh <issue-url>
#
# Regardless of the spam detection result, the script always exits with a zero
# exit code, unless there's a runtime error (missing issue URL, a failing
# `gh`/`jq`/`yq` call, or a model response other than PASS/FAIL).
#
# This script must be run from the root directory of the repository.

set -euo pipefail

_prompt_file=".github/workflows/scripts/spam-detection/prompt.yml"
_generate_sys_prompt_script=".github/workflows/scripts/spam-detection/generate-sys-prompt.sh"
_generate_prompt_script=".github/workflows/scripts/spam-detection/generate-prompt.sh"

# Default to an empty string so that a missing argument reaches the check below
# instead of being aborted by `set -u` with a bare "unbound variable" message.
_issue_url="${1:-}"
if [[ -z "$_issue_url" ]]; then
  echo "error: issue URL is empty" >&2
  exit 1
fi

# Fetch the issue once and extract the fields we need from the JSON payload.
_issue="$(gh issue view --json title,body "$_issue_url")"

_issue_body="$(jq -r ".body" <<< "$_issue")"
_issue_title="$(jq -r ".title" <<< "$_issue")"

_system_prompt="$("$_generate_sys_prompt_script")"
_input_prompt="$("$_generate_prompt_script" "$_issue_title" "$_issue_body")"

# Build the prompt file to run: drop the eval-only sections (test data and
# evaluators), then inject the generated system prompt and the issue prompt.
# The prompts are passed through the environment and read with `strenv` so
# their content is never interpreted as part of the yq expression.
_updated_prompt_file_content="$(
  yq eval 'del(.testData, .evaluators)' "$_prompt_file" |
    _system="$_system_prompt" _input="$_input_prompt" yq eval '.messages[0].content = strenv(_system) | .messages[1].content = strenv(_input)'
)"

# stderr is silenced on purpose (presumably to hide the notice printed when the
# extension is already installed), so report a failure explicitly; otherwise
# `set -e` would terminate the script without any diagnostic.
if ! gh extension install github/gh-models 2>/dev/null; then
  echo "error: failed to install the gh-models extension" >&2
  exit 1
fi

# NOTE(review): the trailing `| cat` is kept from the original; it looks like a
# way to make sure the extension never sees a terminal on stdout - confirm.
_result="$(gh models run --file <(printf '%s\n' "$_updated_prompt_file_content") | cat)"

# The model is instructed to answer with a single word; anything else is
# treated as a runtime error rather than a verdict.
if [[ "$_result" != "PASS" && "$_result" != "FAIL" ]]; then
  echo "error: expected PASS or FAIL but got an unexpected result: $_result" >&2
  exit 1
fi

echo "$_result"
|
||||
26
.github/workflows/scripts/spam-detection/eval.sh
vendored
Executable file
26
.github/workflows/scripts/spam-detection/eval.sh
vendored
Executable file
|
|
@ -0,0 +1,26 @@
|
|||
#!/bin/bash

# Run the eval tests for the spam detection AI model.
#
# The system prompt is generated on the fly and injected into the first message
# of the prompt file before the evaluation is started.
#
# This script must be run from the root directory of the repository.

set -euo pipefail

_sys_prompt_generator=".github/workflows/scripts/spam-detection/generate-sys-prompt.sh"
_prompt_template=".github/workflows/scripts/spam-detection/prompt.yml"

_generated_sys_prompt="$("$_sys_prompt_generator")"

# The prompt is handed to yq through the environment (`strenv`) rather than
# being spliced into the expression text.
_eval_prompt="$(
  _value="$_generated_sys_prompt" yq eval '.messages[0].content = strenv(_value)' "$_prompt_template"
)"

# Ideally this would simply be:
#
# ```
# gh models eval <(echo "$_eval_prompt")
# ```
#
# However, `gh-models` does not throttle the rate of its API requests, so the
# extension code has to be modified to add a deliberate delay between the runs.
# We therefore assume that a `gh-models` binary built with appropriate
# throttling is available in the root directory of the repository, and call it
# directly (not through `gh`).
./gh-models eval <(echo "$_eval_prompt")
|
||||
23
.github/workflows/scripts/spam-detection/generate-prompt.sh
vendored
Executable file
23
.github/workflows/scripts/spam-detection/generate-prompt.sh
vendored
Executable file
|
|
@ -0,0 +1,23 @@
|
|||
#!/bin/bash

# Generate the prompt for the spam detection AI model.
#
# Usage: generate-prompt.sh <issue-title> <issue-body>
#
# The title and body are written to stdout wrapped in `<TITLE>` and `<BODY>`
# tags, with a blank line before, between and after the two sections.
#
# This script must be run from the root directory of the repository.

set -euo pipefail

_title="$1"
_body="$2"

# A single format string keeps the layout in one place; the values are passed
# as arguments so their content is emitted verbatim.
printf '\n<TITLE>\n%s\n</TITLE>\n\n<BODY>\n%s\n</BODY>\n\n' "$_title" "$_body"
|
||||
126
.github/workflows/scripts/spam-detection/generate-sys-prompt.sh
vendored
Executable file
126
.github/workflows/scripts/spam-detection/generate-sys-prompt.sh
vendored
Executable file
|
|
@ -0,0 +1,126 @@
|
|||
#!/bin/bash

# Generate the system prompt for the spam detection AI model.
#
# The prompt written to stdout is assembled from three parts:
#   1. static instructions (role, context, legitimate/spam indicators),
#   2. the help output of the root `gh` command,
#   3. the issue templates found under `.github/ISSUE_TEMPLATE/*.md`, with
#      their YAML front matter removed.
#
# This script must be run from the root directory of the repository.

set -euo pipefail

# NOTE: this is a single-quoted string, so a literal apostrophe has to be
# written as '\'' (close quote, escaped apostrophe, reopen quote). Doubling the
# quote ('') does not escape it in Bash; it silently drops the apostrophe.
_system_prompt='
# Your role

You are a spam detection AI who helps identify spam issues submitted to the GitHub CLI repository.

Note that:
- More context about the GitHub CLI project is provided in section "Context" below.
- Criteria for spam issues are provided in section "Spam content indicators" below.
- Criteria for legitimate issues are provided in section "Legitimate content indicators" below.

With every prompt you are given the title and a body of a GitHub issue. Your task is to determine if the issue is spam
or not.

Prompts will be formatted as follows, where the title and body of an issue are surrounded by `<TITLE>` and `<BODY>` tags:

```
<TITLE>
[issue title goes here]
</TITLE>

<BODY>
[issue body goes here]
</BODY>
```

Your response must be single word `FAIL` if the issue looks like a spam, and `PASS` otherwise.

## Context

The GitHub CLI (also known as `gh`) project is a command-line tool for GitHub. It provides many commands to interact
with various GitHub features.

You can find a the general docs of the GitHub CLI tool in section "GitHub CLI docs" below, which helps you understand
the available commands and their usages.

## Legitimate content indicators

- Clear description of a bug with steps to reproduce.
- Feature requests with detailed explanations and use cases.
- Documentation improvements with specific suggestions.
- Questions about usage with context and examples.
- Reports that reference specific code, files, or functionality.

## Spam content indicators

Here are the common patterns of spam issues:

- A body that is a copy, or a small variation, of one of the issue templates defined under the "Issue templates" section below.
- When comparing with a template, you should ignore the headings and commented lines enclosed in `<!--`-`-->` tags, and
focus on the content.
- Unrelated body and title that do not provide any useful information about the issue.
- An empty issue body.
- A body that contains only a single word or a few words, such as "bug", "help", "issue", "problem".
- A meaningless body that does not provide any useful information about the issue.
- A body that is just one or more links without any context or explanation.
- Generic placeholder text like "Lorem ipsum" or "test test test".
- Repetitive content (same word/phrase repeated multiple times).
- Content that appears to be copied from other sources without relevance to the project.
- Promotional content, advertisements, or unrelated marketing material.
- Content in languages that seem inappropriate for the project context.
- Issues that don'\''t relate to the project'\''s purpose (e.g. personal messages, off-topic discussions).
- Content that seems like to be taken from, or quoting, another discussion or issue which does not establish a sensible
context, or problem statement, or feedback.

'

# Append the help output for the root `gh` command
_system_prompt="${_system_prompt}

## GitHub CLI docs

The GitHub CLI tool has many commands, below is a piece of the help output, surrounded with \`<GitHub CLI docs>\` tags,
for the root \`gh\` command.

<GitHub CLI docs>
\`\`\`
$(gh --help)
\`\`\`
</GitHub CLI docs>
"

# Append the issue templates to the system prompt.
_system_prompt="${_system_prompt}

## Issue templates

Here are the issue templates already defined in the project. The templates are surrounded with \`<Template N>\` tags and
triple backticks, where N is the template number. The templates are provided to help you understand the common patterns
of issues.

"

_template_index=1
for template_file in .github/ISSUE_TEMPLATE/*.md; do
  # Without `nullglob` an unmatched pattern is left as a literal string, so
  # skip anything that is not an existing regular file.
  if ! [[ -f "$template_file" ]]; then
    continue
  fi

  # First expression: remove the YAML front matter (everything up to and
  # including the second `---` line); anything from a further `---` line
  # onwards is dropped as well.
  # Second expression: escape code fences at the start of a line so they cannot
  # close the fenced block the template is wrapped in below.
  _escaped_template="$(
    sed -e '1,/^---$/d; /^---$/,$d' -e 's/^```/\\```/g' "$template_file"
  )"

  _system_prompt="${_system_prompt}

<Template ${_template_index}>

\`\`\`
${_escaped_template}
\`\`\`
</Template ${_template_index}>
"

  # Plain assignment instead of `((_template_index++))`: an arithmetic command
  # returns a non-zero status when its value is 0, which would trip `set -e`
  # if the counter ever started from 0.
  _template_index=$((_template_index + 1))
done

echo "$_system_prompt"
|
||||
31
.github/workflows/scripts/spam-detection/process-issue.sh
vendored
Executable file
31
.github/workflows/scripts/spam-detection/process-issue.sh
vendored
Executable file
|
|
@ -0,0 +1,31 @@
|
|||
#!/bin/bash

# Performs spam detection on an issue and labels it if it's spam.
#
# Usage: process-issue.sh <issue-url>
#
# Regardless of the spam detection result, the script always exits with a zero
# exit code, unless there's a runtime error (missing issue URL, a failing spam
# check, or a failing `gh` call).
#
# This script must be run from the root directory of the repository.

set -euo pipefail

# Default to an empty string so that a missing argument reaches the check below
# instead of being aborted by `set -u` with a bare "unbound variable" message.
_issue_url="${1:-}"
if [[ -z "$_issue_url" ]]; then
  echo "error: issue URL is empty" >&2
  exit 1
fi

_suspected_spam_label="suspected-spam"
_check_issue_script=".github/workflows/scripts/spam-detection/check-issue.sh"

# A non-zero exit of the check script fails this assignment, and `set -e` then
# stops the script before the label step is reached.
_result="$("$_check_issue_script" "$_issue_url")"

# Nothing to do for issues that pass the check.
if [[ "$_result" == "PASS" ]]; then
  exit 0
fi

echo "spam issue detected: $_issue_url"

gh issue edit --add-label "$_suspected_spam_label" "$_issue_url"

echo "issue labelled as suspected spam"
|
||||
5087
.github/workflows/scripts/spam-detection/prompt.yml
vendored
Normal file
5087
.github/workflows/scripts/spam-detection/prompt.yml
vendored
Normal file
File diff suppressed because one or more lines are too long
Loading…
Add table
Add a link
Reference in a new issue