feat: add whitelist char support to nonascii check
Some checks failed
build / build (push) Failing after 9m18s
build / trigger-build-image (push) Has been skipped
build / build (pull_request) Failing after 11m13s
build / trigger-build-image (pull_request) Has been skipped

This commit adds support for whitelisted characters to the repo
healthcheck's non-ASCII file check. It is enabled by a new
`repo-health-checker` switch, `-whitelistedChars`, which takes a
comma-separated list of non-ASCII characters that the check ignores.
Invalid command-line elements are skipped and logged as warnings.
This commit is contained in:
王韵晨520370910012 2026-04-07 20:49:58 -07:00
parent 64bc267300
commit 6496435891
GPG Key ID: F28AB6AE26FFED6F
3 changed files with 51 additions and 5 deletions

View File

@ -45,6 +45,7 @@ var (
checkFileNameList string checkFileNameList string
checkFileSumList string checkFileSumList string
metaFile []string metaFile []string
whitelistedChars string
allowedDomainList string allowedDomainList string
actorCsvPath string actorCsvPath string
showVersion *bool showVersion *bool
@ -57,6 +58,7 @@ func init() {
flag.Float64Var(&repoSize, "repoSize", 2, "maximum size of the repo in MiB") flag.Float64Var(&repoSize, "repoSize", 2, "maximum size of the repo in MiB")
flag.StringVar(&checkFileNameList, "checkFileNameList", "", "comma-separated list of files to check") flag.StringVar(&checkFileNameList, "checkFileNameList", "", "comma-separated list of files to check")
flag.StringVar(&checkFileSumList, "checkFileSumList", "", "comma-separated list of expected checksums") flag.StringVar(&checkFileSumList, "checkFileSumList", "", "comma-separated list of expected checksums")
flag.StringVar(&whitelistedChars, "whitelistedChars", "", "comma-separated list of non-ASCII characters allowed in files")
flag.StringVar(&allowedDomainList, "allowedDomainList", "sjtu.edu.cn", "comma-separated list of allowed domains for commit author email") flag.StringVar(&allowedDomainList, "allowedDomainList", "sjtu.edu.cn", "comma-separated list of allowed domains for commit author email")
flag.StringVar(&actorCsvPath, "actorCsvPath", "/home/tt/.config/joj/students.csv", "path to actor csv file") flag.StringVar(&actorCsvPath, "actorCsvPath", "/home/tt/.config/joj/students.csv", "path to actor csv file")
parseMultiValueFlag(&metaFile, "meta", "meta files to check") parseMultiValueFlag(&metaFile, "meta", "meta files to check")
@ -74,12 +76,14 @@ func main() {
"repoSize", repoSize, "repoSize", repoSize,
"checkFileNameList", checkFileNameList, "checkFileNameList", checkFileNameList,
"checkFileSumList", checkFileSumList, "checkFileSumList", checkFileSumList,
"whitelistedChars", whitelistedChars,
"meta", metaFile, "meta", metaFile,
) )
res := healthcheck.All( res := healthcheck.All(
rootDir, rootDir,
checkFileNameList, checkFileNameList,
checkFileSumList, checkFileSumList,
whitelistedChars,
allowedDomainList, allowedDomainList,
actorCsvPath, actorCsvPath,
metaFile, metaFile,

View File

@ -12,7 +12,8 @@ type Result struct {
} }
func All( func All(
rootDir, checkFileNameList, checkFileSumList, allowedDomainList, actorCsvPath string, rootDir, checkFileNameList, checkFileSumList, whitelistedCharsCSV,
allowedDomainList, actorCsvPath string,
metaFile []string, repoSize float64, metaFile []string, repoSize float64,
) (res Result) { ) (res Result) {
var err error var err error
@ -44,7 +45,7 @@ func All(
} else { } else {
res.Msg += "### Meta File Check Passed\n" res.Msg += "### Meta File Check Passed\n"
} }
err = NonASCIIFiles(rootDir) err = NonASCIIFiles(rootDir, whitelistedCharsCSV)
if err != nil { if err != nil {
res.Msg += fmt.Sprintf("### Non-ASCII Characters File Check Failed:\n%s\n", err.Error()) res.Msg += fmt.Sprintf("### Non-ASCII Characters File Check Failed:\n%s\n", err.Error())
res.Failed = true res.Failed = true

View File

@ -8,13 +8,49 @@ import (
"path/filepath" "path/filepath"
"strings" "strings"
"unicode" "unicode"
"unicode/utf8"
"github.com/go-git/go-git/v5/plumbing/format/gitattributes" "github.com/go-git/go-git/v5/plumbing/format/gitattributes"
) )
// Read the list of comma-separated allowed characters from command line and convert it to a hashmap.
func parseWhitelistedChars(csv string) map[rune]struct{} {
whitelist := make(map[rune]struct{})
if strings.TrimSpace(csv) == "" {
return whitelist
}
for _, raw := range strings.Split(csv, ",") {
elem := strings.TrimSpace(raw)
if elem == "" {
slog.Warn("ignoring invalid whitelisted-chars element", "element", raw, "reason", "empty element")
continue
}
if utf8.RuneCountInString(elem) != 1 {
slog.Warn("ignoring invalid whitelisted-chars element", "element", elem, "reason", "element must be exactly one character")
continue
}
ch, _ := utf8.DecodeRuneInString(elem)
if ch == utf8.RuneError {
slog.Warn("ignoring invalid whitelisted-chars element", "element", elem, "reason", "invalid utf-8 rune")
continue
}
if ch <= unicode.MaxASCII {
slog.Warn("ignoring invalid whitelisted-chars element", "element", elem, "reason", "ASCII characters are not allowed")
continue
}
whitelist[ch] = struct{}{}
}
return whitelist
}
// getNonASCII retrieves a list of files in the specified root directory that contain non-ASCII characters. // getNonASCII retrieves a list of files in the specified root directory that contain non-ASCII characters.
// It searches for non-ASCII characters in each file's content and returns a list of paths to files containing non-ASCII characters. // It searches for non-ASCII characters in each file's content and returns a list of paths to files containing non-ASCII characters.
func getNonASCII(root string) ([]string, error) { func getNonASCII(root string, whitelist map[rune]struct{}) ([]string, error) {
var nonASCII []string var nonASCII []string
gitattrExist := true gitattrExist := true
var matcher gitattributes.Matcher var matcher gitattributes.Matcher
@ -70,6 +106,9 @@ func getNonASCII(root string) ([]string, error) {
for scanner.Scan() { for scanner.Scan() {
cont := true cont := true
for _, c := range scanner.Text() { for _, c := range scanner.Text() {
if _, ok := whitelist[c]; ok {
continue
}
if c > unicode.MaxASCII { if c > unicode.MaxASCII {
nonASCII = append(nonASCII, "\t"+path) nonASCII = append(nonASCII, "\t"+path)
cont = false cont = false
@ -89,8 +128,10 @@ func getNonASCII(root string) ([]string, error) {
// NonASCIIFiles checks for non-ASCII characters in files within the specified root directory. // NonASCIIFiles checks for non-ASCII characters in files within the specified root directory.
// It prints a message with the paths to files containing non-ASCII characters, if any. // It prints a message with the paths to files containing non-ASCII characters, if any.
func NonASCIIFiles(root string) error { // Additionally it accepts a list of whitelisted characters that are allowed, repo-wide.
nonASCII, err := getNonASCII(root) func NonASCIIFiles(root, whitelistedCharsCSV string) error {
whitelist := parseWhitelistedChars(whitelistedCharsCSV)
nonASCII, err := getNonASCII(root, whitelist)
if err != nil { if err != nil {
slog.Error("getting non-ascii", "err", err) slog.Error("getting non-ascii", "err", err)
return fmt.Errorf("error getting non-ascii: %w", err) return fmt.Errorf("error getting non-ascii: %w", err)