diff --git a/.cirrus.yml b/.cirrus.yml index d80d260c7..3178d7010 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -174,6 +174,14 @@ task: ELECTRUM_LINTERS_IGNORE: "" allow_failures: true +task: + name: "linter: ban unicode" + container: + image: python:3.10 + cpu: 1 + memory: 1G + main_script: + - contrib/ban_unicode.py # Cron jobs configured in https://cirrus-ci.com/settings/... # - job "nightly" on branch "master" at "0 30 2 * * ?" (every day at 02:30Z) diff --git a/contrib/ban_unicode.py b/contrib/ban_unicode.py new file mode 100755 index 000000000..9dabcc245 --- /dev/null +++ b/contrib/ban_unicode.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +# +# Copyright (C) 2025 The Electrum developers +# Distributed under the MIT software license, see the accompanying +# file LICENCE or http://www.opensource.org/licenses/mit-license.php +# +# This script scans the whole codebase for non-ASCII unicode characters and +# errors if it finds any, unless the character is specifically whitelisted below. +# The motivation is to protect against homoglyph attacks, invisible unicode characters, +# bidirectional and other control characters, and other malicious unicode usage. +# Given that we mostly expect to use ASCII characters in the source code, +# the most robust and generic fix seems to be to just ban all non-ASCII unicode usage.
import os.path
import subprocess
import sys
from typing import Iterator, List, Tuple

# Repository root: the parent of the directory that contains this script.
project_root = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

# Tracked paths that start with one of these prefixes are not scanned.
EXCLUDE_PATH_PREFIX = {
    "electrum/wordlist/",
    "fastlane/",
    "tests/",
}
# Tracked paths with one of these extensions (case-sensitive) are not scanned.
EXCLUDE_EXTENSIONS = {
    ".jpg", ".jpeg", ".png", ".ttf", ".otf", ".pdn", ".icns", ".ico", ".gif",
}
# Non-ASCII characters that are allowed in scanned files.
# Each entry must be exactly ONE code point: membership is tested per character,
# so a multi-character string here could never match anything.
# Entries are spelled as escapes so this list survives any re-encoding of the file.
# NOTE(review): reconstructed from mis-decoded literals; confirm against the intended list.
UNICODE_WHITELIST = {
    "\U0001f4ac",  # SPEECH BALLOON
    "\U0001f5ef",  # RIGHT ANGER BUBBLE
    "\u26a0",  # WARNING SIGN
    "\ufe0f",  # VARIATION SELECTOR-16
    "\u2713",  # CHECK MARK
    "\u25b7",  # WHITE RIGHT-POINTING TRIANGLE
    "\u25bd",  # WHITE DOWN-POINTING TRIANGLE
    "\u2026",  # HORIZONTAL ELLIPSIS
    "\u2022",  # BULLET
    "\u2588",  # FULL BLOCK
    "\u2122",  # TRADE MARK SIGN
    "\u2248",  # ALMOST EQUAL TO
    "\u00e1",  # LATIN SMALL LETTER A WITH ACUTE
    "\u00e9",  # LATIN SMALL LETTER E WITH ACUTE
    "\u2019",  # RIGHT SINGLE QUOTATION MARK
    "\u2502",  # BOX DRAWINGS LIGHT VERTICAL
    "\u2500",  # BOX DRAWINGS LIGHT HORIZONTAL
    "\u2514",  # BOX DRAWINGS LIGHT UP AND RIGHT
    "\u251c",  # BOX DRAWINGS LIGHT VERTICAL AND RIGHT
}


def is_excluded(file_path: str) -> bool:
    """Return True if *file_path* is exempt from scanning (by prefix or extension)."""
    if file_path.startswith(tuple(EXCLUDE_PATH_PREFIX)):
        return True
    _fname, ext = os.path.splitext(file_path)
    return ext in EXCLUDE_EXTENSIONS


def find_banned_chars(text: str) -> Iterator[Tuple[int, str, str]]:
    """Yield ``(line_no, line, char)`` for each non-whitelisted non-ASCII char in *text*.

    ``line_no`` is 1-based. Lines are split on ``"\\n"`` only, not with
    ``str.splitlines()``: the latter also splits on (and discards) U+0085,
    U+2028 and U+2029, which would hide exactly the kind of invisible
    characters this check is supposed to catch.
    """
    for line_no, line in enumerate(text.split("\n"), start=1):
        for char in line:
            if ord(char) > 0x7f and char not in UNICODE_WHITELIST:
                yield line_no, line, char


def scan_file(file_path: str) -> bool:
    """Print every banned character found in *file_path*; return True if any was found.

    Raises:
        Exception: if the file is not valid UTF-8 (chained from UnicodeDecodeError).
    """
    try:
        # Default newline handling translates "\r\n" and "\r" to "\n" on read.
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    except UnicodeDecodeError as e:
        raise Exception(f"cannot parse file {file_path=}") from e
    found = False
    for line_no, line, char in find_banned_chars(text):
        print(f"{file_path}:{line_no}. {line=}. hex={hex(ord(char))}. {char=}")
        found = True
    return found


def list_tracked_files() -> List[str]:
    """Return the paths reported by ``git ls-files`` for the current directory.

    ``-z`` makes git emit NUL-terminated, unquoted paths, so names containing
    special or non-ASCII characters come back verbatim and can be opened.
    """
    raw = subprocess.check_output(["git", "ls-files", "-z"])
    return [path for path in raw.decode("utf-8").split("\0") if path]


def main() -> int:
    """Scan all tracked, non-excluded files; return the process exit code (0 or 1)."""
    os.chdir(project_root)
    exit_code = 0
    for file_path in list_tracked_files():
        # Entries that are directories on disk cannot be opened as text; skip them.
        if os.path.isdir(file_path):
            continue
        if is_excluded(file_path):
            continue
        if scan_file(file_path):
            exit_code = 1
    return exit_code


if __name__ == "__main__":
    sys.exit(main())