Merge pull request #9801 from SomberNight/202505_ban_unicode

ci: add linter task "ban unicode" to protect against malicious unicode
2025-05-19 15:02:45 +02:00
parent 000b1bb3b5 2ce72d3cab
commit 8396a22286
3 changed files with 67 additions and 1 deletions
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -174,6 +174,14 @@ task:
        ELECTRUM_LINTERS_IGNORE: ""
      allow_failures: true

+task:
+  name: "linter: ban unicode"
+  container:
+    image: python:3.10
+    cpu: 1
+    memory: 1G
+  main_script:
+    - contrib/ban_unicode.py

 # Cron jobs configured in https://cirrus-ci.com/settings/...
 # - job "nightly" on branch "master" at "0 30 2 * * ?"  (every day at 02:30Z)
--- a/contrib/ban_unicode.py
+++ b/contrib/ban_unicode.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+#
+# Copyright (C) 2025 The Electrum developers
+# Distributed under the MIT software license, see the accompanying
+# file LICENCE or http://www.opensource.org/licenses/mit-license.php
+#
+# This script scans the whole codebase for unicode characters and
+# errors if it finds any, unless the character is specifically whitelisted below.
+# The motivation is to protect against homoglyph attacks, invisible unicode characters,
+# bidirectional and other control characters, and other malicious unicode usage.
+# Given that we mostly expect to use ASCII characters in the source code,
+# the most robust and generic fix seems to be to just ban all unicode usage.
+
+import os.path
+import subprocess
+import sys
+
+project_root = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
+os.chdir(project_root)
+
+EXCLUDE_PATH_PREFIX = {
+    "electrum/wordlist/",
+    "fastlane/",
+    "tests/",
+}
+EXCLUDE_EXTENSIONS = {
+    ".jpg", ".jpeg", ".png", ".ttf", ".otf", ".pdn", ".icns", ".ico", ".gif",
+}
+UNICODE_WHITELIST = {
+    "💬", "🗯", "⚠", chr(0xfe0f), "✓", "▷", "▽", "…", "•", "█", "™", "≈",
+    "á", "é", "’",
+    "│", "─", "└", "├",
+}
+
+exit_code = 0
+
+bfiles = subprocess.check_output(["git", "ls-files"])
+bfiles = bfiles.decode("utf-8")
+for file_path in bfiles.splitlines():
+    if os.path.isdir(file_path):
+        continue
+    if any(file_path.startswith(pattern) for pattern in EXCLUDE_PATH_PREFIX):
+        continue
+    _fname, ext = os.path.splitext(file_path)
+    if ext in EXCLUDE_EXTENSIONS:
+        continue
+    # open file
+    try:
+        with open(file_path, "r", encoding="utf-8") as f:
+            for line_no, line in enumerate(f.read().splitlines()):
+                for char in line:
+                    if ord(char)>0x7f and char not in UNICODE_WHITELIST:
+                        print(f"{file_path}:{line_no}. {line=}. hex={hex(ord(char))}. {char=}")
+                        exit_code = 1
+    except UnicodeDecodeError as e:
+        raise Exception(f"cannot parse file {file_path=}") from e
+
+sys.exit(exit_code)
--- a/electrum/gui/fonts/PTMono.LICENSE
+++ b/electrum/gui/fonts/PTMono.LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2011, ParaType Ltd. (http://www.paratype.com/public),
+Copyright (c) 2011, ParaType Ltd. (http://www.paratype.com/public),
 with Reserved Font Names "PT Sans", "PT Serif", "PT Mono" and "ParaType".

 This Font Software is licensed under the SIL Open Font License, Version 1.1.