feat: add whitelist char support to nonascii check (#100)
All checks were successful
submodules sync / sync (push) Successful in 4m27s
build / build (push) Successful in 6m32s
build / trigger-build-image (push) Successful in 23s

This commit adds support for whitelisted characters to the non-ASCII file check of the repo health checker. It is enabled by a new `repo-health-checker` switch, `-whitelistedChars`, which takes a comma-separated list of non-ASCII characters to ignore during the check. Invalid command-line input is reported through the logger.

Co-Authored-By: GitHub Copilot <noreply@microsoft.com>
<details>
<summary>Copilot Prompt</summary>
<br>
This is a repo for an online judge orchestrator system «JOJ3». Under `cmd/` lies a source directory for a Go command, `repo-health-checker`. You can tell from its name that it checks the repo for things like repo size, commit messages, non-ASCII character usage, etc. before sending the work to the actual judging and grading system.

Now, I want the non-ASCII character checking function of the repo health checker to be flexible - it shall accept a list of non-ASCII characters and deem them acceptable.

## Your task

- Accept this new cmdline arg. In `cmd/repo-health-checker/main.go`, accept a new command line flag `-whitelisted-chars`, which shall take exactly one string of comma-separated non-ASCII characters. This string shall be passed to the actual healthcheck package.
- Respect this list while scanning the files. In `pkg/healthcheck/nonascii.go`, function `getNonASCII()`, we utilize a bufio *Scanner* to scan through all files for non-ASCII characters. We would like the list of acceptable chars to be passed from the cmdline to here, and modify the scanner logic to actually accept the corresponding characters.
- Error handling and reporting. This command line arg, `-whitelisted-chars`, could be completely absent, in which case no characters shall be exempted by default. The comma-separated list passed to the command may contain ASCII characters or multiple characters that are not properly separated; in that case, ignore that element and report the incident via the slog logging framework used in this project.
- Test your work. Create new testcases under `examples/healthcheck/` to reflect this change. Refer to `examples/healthcheck/asciifile/` to learn how to configure the repo health checker. Integrate your work into the Go test framework so that it can be invoked by running `make test` in the terminal.
  - Note: Use `git init` to initialize your testcase directory and make an initial commit - this project, JOJ3, only runs in Git repos.

## Notes
- Directory structure. `cmd/` for invokable commands, `pkg/` for the actual logic, `internal` - something you don't need to worry about.
- JOJ3 vs. Health Check. `joj3` is a separate executable; in this session we are only working on the `repo-health-checker`.
- Extras. Make sure to read `README.md` and the directory structure before you go; also, create To-do before you execute your plan.
</details>

Reviewed-on: https://focs.ji.sjtu.edu.cn/git/JOJ/JOJ3/pulls/100
Reviewed-by: 张泊明518370910136 <bomingzh@sjtu.edu.cn>
Co-authored-by: Mack Wang <mac-wang@sjtu.edu.cn>
Co-committed-by: Mack Wang <mac-wang@sjtu.edu.cn>
This commit is contained in:
王韵晨520370910012 2026-04-26 08:23:33 +08:00 committed by manuel
parent 9d75e359af
commit 04ae1c8674
5 changed files with 56 additions and 5 deletions

4
.gitmodules vendored
View File

@ -70,3 +70,7 @@
path = examples/cpplint/simple
url = ssh://git@focs.ji.sjtu.edu.cn:2222/JOJ/JOJ3-examples.git
branch = cpplint/simple
[submodule "examples/healthcheck/whitelistedchars-success"]
path = examples/healthcheck/whitelistedchars-success
url = ssh://git@focs.ji.sjtu.edu.cn:2222/JOJ/JOJ3-examples.git
branch = healthcheck/whitelistedchars-success

View File

@ -45,6 +45,7 @@ var (
checkFileNameList string
checkFileSumList string
metaFile []string
whitelistedChars string
allowedDomainList string
actorCsvPath string
showVersion *bool
@ -57,6 +58,7 @@ func init() {
flag.Float64Var(&repoSize, "repoSize", 2, "maximum size of the repo in MiB")
flag.StringVar(&checkFileNameList, "checkFileNameList", "", "comma-separated list of files to check")
flag.StringVar(&checkFileSumList, "checkFileSumList", "", "comma-separated list of expected checksums")
flag.StringVar(&whitelistedChars, "whitelistedChars", "", "comma-separated list of non-ASCII characters allowed in files")
flag.StringVar(&allowedDomainList, "allowedDomainList", "sjtu.edu.cn", "comma-separated list of allowed domains for commit author email")
flag.StringVar(&actorCsvPath, "actorCsvPath", "/home/tt/.config/joj/students.csv", "path to actor csv file")
parseMultiValueFlag(&metaFile, "meta", "meta files to check")
@ -74,12 +76,14 @@ func main() {
"repoSize", repoSize,
"checkFileNameList", checkFileNameList,
"checkFileSumList", checkFileSumList,
"whitelistedChars", whitelistedChars,
"meta", metaFile,
)
res := healthcheck.All(
rootDir,
checkFileNameList,
checkFileSumList,
whitelistedChars,
allowedDomainList,
actorCsvPath,
metaFile,

@ -0,0 +1 @@
Subproject commit bb9bc06fd5753e7338e9b3230b2fc3e3ce971a05

View File

@ -12,7 +12,8 @@ type Result struct {
}
func All(
rootDir, checkFileNameList, checkFileSumList, allowedDomainList, actorCsvPath string,
rootDir, checkFileNameList, checkFileSumList, whitelistedChars,
allowedDomainList, actorCsvPath string,
metaFile []string, repoSize float64,
) (res Result) {
var err error
@ -44,7 +45,7 @@ func All(
} else {
res.Msg += "### Meta File Check Passed\n"
}
err = NonASCIIFiles(rootDir)
err = NonASCIIFiles(rootDir, whitelistedChars)
if err != nil {
res.Msg += fmt.Sprintf("### Non-ASCII Characters File Check Failed:\n%s\n", err.Error())
res.Failed = true

View File

@ -8,11 +8,47 @@ import (
"path/filepath"
"strings"
"unicode"
"unicode/utf8"
"github.com/go-git/go-git/v5"
"github.com/go-git/go-git/v5/plumbing/format/gitattributes"
)
// parseWhitelistedChars converts the comma-separated whitelist given on the
// command line into a set of runes.
//
// Each element is stripped of surrounding ASCII whitespace and must then be
// exactly one valid, non-ASCII character. Elements that are empty, hold more
// than one character, are not valid UTF-8, or are ASCII are skipped and
// reported with slog.Warn. A csv that is blank yields an empty set without
// any warning. The returned map is never nil.
func parseWhitelistedChars(csv string) map[rune]struct{} {
	// Trim ASCII whitespace only. strings.TrimSpace also strips non-ASCII
	// white space such as U+00A0 or U+3000, which would turn those characters
	// into "empty" elements and make them impossible to whitelist.
	const asciiSpace = " \t\n\v\f\r"
	const msg = "ignoring invalid whitelisted-chars element"

	whitelist := make(map[rune]struct{})
	if strings.Trim(csv, asciiSpace) == "" {
		return whitelist
	}
	for _, raw := range strings.Split(csv, ",") {
		elem := strings.Trim(raw, asciiSpace)
		ch, _ := utf8.DecodeRuneInString(elem)

		// The empty case logs the untrimmed text so the offending input is
		// visible; every other case logs the trimmed element.
		shown, reason := elem, ""
		switch {
		case elem == "":
			shown, reason = raw, "empty element"
		case utf8.RuneCountInString(elem) != 1:
			reason = "element must be exactly one character"
		case ch == utf8.RuneError:
			// DecodeRuneInString reports invalid UTF-8 as utf8.RuneError;
			// a literal U+FFFD is rejected by the same comparison.
			reason = "invalid utf-8 rune"
		case ch <= unicode.MaxASCII:
			reason = "ASCII characters are not allowed"
		}
		if reason != "" {
			slog.Warn(msg, "element", shown, "reason", reason)
			continue
		}
		whitelist[ch] = struct{}{}
	}
	return whitelist
}
// getSubmodulePathsFromGoGit uses the go-git library to open the repository
// at the given root path and retrieve a list of all submodule paths.
// It returns a set of submodule paths for efficient lookup.
@ -48,7 +84,7 @@ func getSubmodulePathsFromGoGit(root string) (map[string]struct{}, error) {
// getNonASCII retrieves a list of files in the specified root directory that contain non-ASCII characters.
// It searches for non-ASCII characters in each file's content and returns a list of paths to files containing non-ASCII characters.
func getNonASCII(root string) ([]string, error) {
func getNonASCII(root string, whitelist map[rune]struct{}) ([]string, error) {
var nonASCII []string
gitattrExist := true
var matcher gitattributes.Matcher
@ -113,6 +149,9 @@ func getNonASCII(root string) ([]string, error) {
for scanner.Scan() {
cont := true
for _, c := range scanner.Text() {
if _, ok := whitelist[c]; ok {
continue
}
if c > unicode.MaxASCII {
nonASCII = append(nonASCII, "\t"+path)
cont = false
@ -132,8 +171,10 @@ func getNonASCII(root string) ([]string, error) {
// NonASCIIFiles checks for non-ASCII characters in files within the specified root directory.
// It prints a message with the paths to files containing non-ASCII characters, if any.
func NonASCIIFiles(root string) error {
nonASCII, err := getNonASCII(root)
// Additionally, it accepts a comma-separated list of whitelisted characters that are allowed repo-wide.
func NonASCIIFiles(root, whitelistedChars string) error {
whitelist := parseWhitelistedChars(whitelistedChars)
nonASCII, err := getNonASCII(root, whitelist)
if err != nil {
slog.Error("getting non-ascii", "err", err)
return fmt.Errorf("error getting non-ascii: %w", err)