feat(i18n): add locale key sorter and pre-commit hook

Add script/i18n/sort_locales.py (ruamel.yaml round-trip) that sorts mapping
keys into yamllint key-ordering while preserving comments, quoting, block
scalars and the license header. It splits off the document header verbatim,
re-anchors own-line comments to the key they precede, normalizes end-of-line
comment spacing to >= 2 spaces, sorts by the literal YAML scalar text (so
true/false keys match yamllint), and refuses to write unless key-paths and
values are unchanged.

Wire it into a lefthook pre-commit hook so the yamllint-checked locale files
stay sorted; the existing reviewdog yamllint check remains the CI gate.
requirements.txt pins only the runtime dependency (ruamel.yaml); pytest is a
dev-only tool and is not pinned.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
David F
2026-06-11 10:41:45 +02:00
parent 94e8490855
commit 29de1eaeb3
4 changed files with 441 additions and 0 deletions
+5
View File
@@ -31,3 +31,8 @@ pre-commit:
files: git diff --name-only --staged
glob: "{Gemfile.lock,frontend/package.json}"
run: script/check_same_primer_view_components_version_everywhere
sort-locales:
files: git diff --name-only --staged
glob: "**/config/locales/{en,js-en}.yml"
run: python3 script/i18n/sort_locales.py {files}
stage_fixed: true
+6
View File
@@ -0,0 +1,6 @@
# Runtime dependency for script/i18n/sort_locales.py and the lefthook pre-commit hook.
# Install with: pip install -r script/i18n/requirements.txt
ruamel.yaml==0.18.10
# Running the tests additionally needs pytest (a dev-only tool, intentionally
# not pinned here): pip install pytest && python3 -m pytest script/i18n/
+219
View File
@@ -0,0 +1,219 @@
#!/usr/bin/env python3
"""Sort keys in OpenProject locale YAML files to satisfy yamllint's key-ordering.
Sorts mapping keys recursively in Unicode codepoint order (matching yamllint's
strcoll comparison under CI's C/POSIX locale for the ASCII keys these files use).
Sequences and scalar values are left untouched. Comments, quoting and block
scalars are preserved via ruamel.yaml round-trip.
Usage:
python3 script/i18n/sort_locales.py FILE [FILE ...]
"""
from __future__ import annotations
import io
import sys
from pathlib import Path
try:
from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap, CommentedSeq
except ModuleNotFoundError:
sys.stderr.write(
"sort_locales.py requires ruamel.yaml. Install it with:\n"
" pip install -r script/i18n/requirements.txt\n"
)
sys.exit(1)
def make_yaml() -> YAML:
    """Build the ruamel ``YAML`` handler used for loading and dumping.

    The handler keeps the original quoting of scalars, uses a line width of
    4096 columns (presumably so long translation strings are not re-wrapped
    on dump) and indents mappings by 2 and sequences by 4 with a dash
    offset of 2.
    """
    handler = YAML()
    handler.preserve_quotes = True
    handler.width = 4096
    handler.indent(mapping=2, sequence=4, offset=2)
    return handler
def _sort_key(key):
"""Return the YAML scalar text yamllint compares against.
ruamel parses unquoted true/false/null as Python objects, but yamllint
compares the literal scalar text, so map them back to what gets written.
"""
if isinstance(key, bool):
return "true" if key else "false"
if key is None:
return "null"
return str(key)
def _bare(comment_line: str) -> str:
"""'# foo' -> 'foo', '#foo' -> 'foo', '#' -> ''. ruamel's comment APIs re-add '# '."""
s = comment_line.lstrip()[1:] # drop leading whitespace and the '#'
return s[1:] if s.startswith(" ") else s
def _split_post(value: str):
    """Break a post-comment token value into ``(eol, own_lines)``.

    ``eol`` is the bare text of the comment that sits on the first line of
    *value*, provided *value* does not begin with a newline; otherwise it
    is ``None``. ``own_lines`` collects the bare text of every other line
    that starts with '#'. Lines that do not start with '#' (blank lines,
    for instance) are skipped.
    """
    first_is_inline = not value.startswith("\n")
    eol = None
    own_lines = []
    for position, raw in enumerate(value.split("\n")):
        text = raw.strip()
        if not text.startswith("#"):
            continue
        if first_is_inline and position == 0:
            eol = _bare(text)
            continue
        own_lines.append(_bare(text))
    return eol, own_lines
def reanchor_comments(node, child_indent: int = 0) -> None:
    """Attach own-line comments to the key that follows them.

    After this pass a comment block written above a key is stored as that
    key's "before" comment, so it moves together with the key when the
    mapping is reordered. Call it before ``sort_node``.

    ``child_indent`` is the indentation passed to ruamel for the moved
    comments of this mapping's keys; it grows by 2 per nesting level.
    """
    if isinstance(node, CommentedSeq):
        for element in node:
            reanchor_comments(element, child_indent + 2)
        return
    if not isinstance(node, CommentedMap):
        return

    # Handle nested containers first.
    for name in list(node.keys()):
        reanchor_comments(node[name], child_indent + 2)

    # NOTE: the mapping's own leading comment (node.ca.comment) is left alone
    # on purpose. For nested mappings ruamel also keeps that comment on the
    # parent's `ca.items[key]`, which already travels with the key; moving it
    # here would emit it twice. The only unanchored leading comment is the
    # root document header, whose mapping has a single key (`en`) and is
    # never reordered.
    ordered = list(node.keys())
    last_position = len(ordered) - 1
    for position, name in enumerate(ordered):
        entry = node.ca.items.get(name)
        if not entry or entry[2] is None:
            continue
        eol, own_lines = _split_post(entry[2].value)
        if position >= last_position or not own_lines:
            # Nothing to move, or these are trailing comments at the end of
            # the mapping with no following key to attach to.
            continue
        # Drop the combined token, then restore its two halves separately:
        # the end-of-line part stays on this key, the own-line part goes in
        # front of the next key.
        node.ca.items[name][2] = None
        if eol is not None:
            node.yaml_add_eol_comment(eol, name)
        node.yaml_set_comment_before_after_key(
            ordered[position + 1],
            before="\n".join(own_lines),
            indent=child_indent,
        )
def sort_node(node) -> None:
    """Sort the keys of every mapping below *node*, in place.

    Children are sorted before their parent. Sequences keep their element
    order; only mappings found inside them are sorted. Ordering follows
    ``_sort_key``.
    """
    if isinstance(node, CommentedSeq):
        for element in node:
            sort_node(element)
    elif isinstance(node, CommentedMap):
        for name in list(node.keys()):
            sort_node(node[name])
        # Moving each key to the end in sorted order leaves the mapping sorted.
        ordered = sorted(node.keys(), key=_sort_key)
        for name in ordered:
            node.move_to_end(name)
def flatten(node, prefix=()):
    """Yield ``(path, value)`` for each leaf under *node*.

    ``path`` is a tuple made of ``str(key)`` for mapping levels and
    ``"[index]"`` for sequence levels. Anything that is neither a dict nor
    a list counts as a leaf. Empty dicts and lists yield nothing.
    """
    if isinstance(node, dict):
        children = ((str(key), value) for key, value in node.items())
    elif isinstance(node, list):
        children = ((f"[{index}]", value) for index, value in enumerate(node))
    else:
        yield prefix, node
        return
    for label, child in children:
        yield from flatten(child, prefix + (label,))
def _normalize_eol_comment_spacing(text: str) -> str:
    """Ensure at least 2 spaces before end-of-line comments.

    This is what yamllint's comments rule expects. The positions of the
    comments are taken from a ruamel parse of *text* rather than from a
    textual search, so a '#' inside a string value is never touched.

    Args:
        text: serialized YAML.

    Returns:
        *text* with every under-spaced end-of-line comment separated from
        the preceding content by exactly two spaces; all other characters
        are unchanged.
    """
    data = make_yaml().load(text)
    targets = []  # (0-based line, 0-based column of '#')

    def visit(node):
        ca = getattr(node, "ca", None)
        if ca is not None:
            for item in ca.items.values():
                token = item[2] if item else None
                # A value starting with a newline holds only own-line
                # comments, not a comment on the key's own line.
                if token is not None and not token.value.startswith("\n"):
                    targets.append((token.start_mark.line, token.start_mark.column))
        if isinstance(node, dict):
            for value in node.values():
                visit(value)
        elif isinstance(node, list):
            for value in node:
                visit(value)

    if data is not None:
        visit(data)

    lines = text.split("\n")
    for line_no, col in targets:
        if line_no >= len(lines):
            continue
        line = lines[line_no]
        # `col` must index an existing character; `col == len(line)` is
        # already out of range, so the bound check has to be `>=`.
        if col < 1 or col >= len(line) or line[col] != "#":
            continue
        start = col
        while start > 0 and line[start - 1] == " ":
            start -= 1
        # Only an end-of-line comment (content precedes the spaces) that is
        # under-spaced gets padded out to the required two spaces.
        if start > 0 and (col - start) < 2:
            lines[line_no] = line[:start] + "  " + line[col:]
    return "\n".join(lines)
def sort_file(path: str) -> None:
    """Sort the mapping keys of the locale file at *path*, in place.

    Everything before the root ``en:`` line (license header, ``---``
    marker) is copied through verbatim; only the remainder is parsed,
    re-anchored, sorted and serialized again. The file is read and written
    as UTF-8 regardless of the platform default encoding.

    The function returns without touching the file when no line equal to
    ``en:`` exists, when the body parses to nothing, or when the result is
    identical to the current content.

    Raises:
        SystemExit: if the set of key-paths and values differs after
            sorting, or if the text about to be written does not parse
            back to the original key-paths and values.
    """
    yaml = make_yaml()
    target = Path(path)
    text = target.read_text(encoding="utf-8")

    # Preserve everything up to the root `en:` line verbatim: the license
    # header and any `---` document-start marker. ruamel does not reliably
    # round-trip pre-document leading comments, so we never hand them to it.
    lines = text.splitlines(keepends=True)
    body_start = next(
        (i for i, line in enumerate(lines) if line.rstrip("\n") == "en:"), None)
    if body_start is None:
        return  # no recognizable root mapping; leave untouched
    preamble = "".join(lines[:body_start])
    body = "".join(lines[body_start:])

    data = yaml.load(body)
    if data is None:
        return

    before = dict(flatten(data))
    reanchor_comments(data)
    sort_node(data)
    if dict(flatten(data)) != before:
        raise SystemExit(
            f"{path}: refusing to write — sorting changed content, not just order"
        )

    buffer = io.StringIO()
    yaml.dump(data, buffer)
    body_out = _normalize_eol_comment_spacing(buffer.getvalue())
    body_out = body_out.rstrip("\n") + "\n"  # exactly one trailing newline

    # The check above only sees the in-memory tree. Comment re-anchoring,
    # serialization and the textual spacing fix all happen afterwards, so
    # validate the exact text that is about to be written as well.
    if dict(flatten(make_yaml().load(body_out))) != before:
        raise SystemExit(
            f"{path}: refusing to write — serialized output does not parse "
            "back to the original keys and values"
        )

    result = preamble + body_out
    if result != text:
        target.write_text(result, encoding="utf-8")
def main(argv: list[str]) -> int:
    """Sort every file named in *argv* after the program name; return 0."""
    targets = argv[1:]
    for target in targets:
        sort_file(target)
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)
+211
View File
@@ -0,0 +1,211 @@
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
import sort_locales # noqa: E402
def run_sort(tmp_path, text):
    """Write *text* to ``en.yml`` under *tmp_path*, sort it, return the new content."""
    target = tmp_path / "en.yml"
    target.write_text(text)
    sort_locales.sort_file(str(target))
    return target.read_text()
def test_sorts_top_level_and_nested_keys(tmp_path):
    """Keys are sorted directly under ``en`` and inside a nested mapping."""
    # `zebra` / `aardvark` must be indented deeper than `apple`, otherwise
    # the fixture is a flat mapping and the nested case is never exercised.
    out = run_sort(tmp_path, (
        "en:\n"
        "  banana: \"B\"\n"
        "  apple:\n"
        "    zebra: 2\n"
        "    aardvark: 1\n"
    ))
    assert out.index("apple:") < out.index("banana:")
    assert out.index("aardvark:") < out.index("zebra:")
    # the nested keys are still emitted inside `apple`, i.e. before `banana`
    assert out.index("apple:") < out.index("aardvark:")
    assert out.index("zebra:") < out.index("banana:")
def test_own_line_comment_before_non_first_key_moves_with_it(tmp_path):
    """An own-line comment above a non-first key follows that key when sorted."""
    out = run_sort(tmp_path, (
        "en:\n"
        "  zebra: 1\n"
        "  # note about alpha\n"
        "  alpha: 2\n"
    ))
    assert out.index("alpha:") < out.index("zebra:")
    # The comment travels with alpha and stays directly above it. The output
    # is dumped with two-space mapping indentation, so the expected text
    # must carry two spaces before `alpha:`.
    assert "# note about alpha\n  alpha:" in out
def test_leading_comment_before_first_key_stays_at_block_top(tmp_path):
    # Documented behavior: a comment ahead of a mapping's FIRST key acts as
    # a block header. It stays at the top of the block after sorting instead
    # of following the key it originally preceded. (Dedent/first-key comments
    # are not auto-relocated; they're hand-fixed during the one-time sort.)
    source = (
        "en:\n"
        " # block header\n"
        " beta: 2\n"
        " alpha: 1\n"
    )
    result = run_sort(tmp_path, source)
    alpha_at = result.index("alpha:")
    assert alpha_at < result.index("beta:")
    assert result.index("# block header") < alpha_at
def test_sorts_quoted_keys_by_unquoted_value(tmp_path):
    """Quoted keys are ordered by their content and keep their quotes."""
    source = (
        "en:\n"
        " \"zzz\": 1\n"
        " \"import/jira\": 2\n"
        " aaa: 3\n"
    )
    result = run_sort(tmp_path, source)
    # codepoint order: aaa (97) < import/jira (105) < zzz (122)
    first = result.index("aaa:")
    second = result.index("import/jira")
    third = result.index("zzz")
    assert first < second < third
    # the quotes written in the source survive the round trip
    assert '"import/jira"' in result
def test_preserves_block_scalars(tmp_path):
    """A literal block scalar keeps its indicator and its lines after sorting."""
    # The scalar's content has to be indented deeper than its key; with the
    # same indentation as `alpha:` the fixture is not valid YAML.
    out = run_sort(tmp_path, (
        "en:\n"
        "  zebra: \"Z\"\n"
        "  alpha: |\n"
        "    multi\n"
        "    line\n"
    ))
    assert out.index("alpha:") < out.index("zebra:")
    assert "|" in out
    # content is emitted one level (two spaces) below the two-space key
    assert "    multi\n    line" in out
def test_multiline_comment_block_moves_with_following_key(tmp_path):
    """A multi-line own-line comment block moves as a unit with its key."""
    out = run_sort(tmp_path, (
        "en:\n"
        "  zebra: 1\n"
        "  # explains alpha line 1\n"
        "  # explains alpha line 2\n"
        "  alpha: 2\n"
    ))
    # The whole block stays directly above alpha, which sorts first. Every
    # line of the output is indented by two spaces, so the expected text
    # uses two spaces after each newline.
    assert "# explains alpha line 1\n  # explains alpha line 2\n  alpha:" in out
    assert out.index("alpha:") < out.index("zebra:")
def test_eol_comment_stays_with_its_key_and_own_line_moves(tmp_path):
    """An end-of-line comment stays put while the own-line comment below it moves."""
    out = run_sort(tmp_path, (
        "en:\n"
        "  zebra: 1  # eol on zebra\n"
        "  # describes alpha\n"
        "  alpha: 2\n"
    ))
    # The eol comment remains on zebra's line. The sorter guarantees at
    # least two spaces before an end-of-line comment, so that is what the
    # output must contain.
    assert "zebra: 1  # eol on zebra" in out
    # the own-line comment moves above alpha, at the two-space key indent
    assert "# describes alpha\n  alpha:" in out
    assert out.index("alpha:") < out.index("zebra:")
import pytest # noqa: E402
from ruamel.yaml import YAML # noqa: E402
def _load(text):
    """Parse *text* with a default ruamel ``YAML`` instance and return the data."""
    parser = YAML()
    return parser.load(text)
def test_preserves_all_key_paths_and_values(tmp_path):
    """Sorting changes key order only: flattened paths and values are equal."""
    src = (
        "en:\n"
        " user:\n"
        " display_format: \"Display format\"\n"
        " deletion: \"Deletion\"\n"
        " activities:\n"
        " index:\n"
        " title: \"T\"\n"
    )
    out = run_sort(tmp_path, src)
    expected = dict(sort_locales.flatten(_load(src)))
    actual = dict(sort_locales.flatten(_load(out)))
    # same key-paths and values, order aside
    assert expected == actual
def test_duplicate_keys_raise(tmp_path):
    """A mapping that repeats a key makes the sorter raise."""
    duplicated = (
        "en:\n"
        " alpha: 1\n"
        " alpha: 2\n"
    )
    with pytest.raises(Exception):
        run_sort(tmp_path, duplicated)
def _assert_keys_sorted(node):
if isinstance(node, dict):
keys = [sort_locales._sort_key(k) for k in node.keys()]
assert keys == sorted(keys), f"unsorted mapping: {keys}"
for value in node.values():
_assert_keys_sorted(value)
elif isinstance(node, list):
for value in node:
_assert_keys_sorted(value)
def test_output_is_yamllint_ordered(tmp_path):
    """Every mapping in the sorted output has its keys in sorter order."""
    source = (
        "en:\n"
        " gamma: 3\n"
        " alpha:\n"
        " delta: 1\n"
        " beta: 2\n"
        " bool_keys:\n"
        " true: t\n"
        " false: f\n"
    )
    result = run_sort(tmp_path, source)
    _assert_keys_sorted(_load(result))
    # boolean keys sort as written: false before true
    assert result.index("false:") < result.index("true:")
def test_preserves_license_header_and_document_marker(tmp_path):
    """The text ahead of the root ``en:`` line is copied through unchanged."""
    header = (
        "#-- copyright\n"
        "# OpenProject is an open source project management software.\n"
        "#++\n"
        "\n"
        "---\n"
    )
    body = (
        "en:\n"
        " zebra: 1\n"
        " alpha: 2\n"
    )
    result = run_sort(tmp_path, header + body)
    # header + marker preserved verbatim and still at the very top
    assert result.startswith(header)
    assert result.index("alpha:") < result.index("zebra:")
def test_single_trailing_newline(tmp_path):
    """Extra blank lines at the end of the input collapse to one newline."""
    result = run_sort(tmp_path, "en:\n b: 1\n a: 2\n\n\n")
    assert result[-1:] == "\n"
    assert result[-2:] != "\n\n"
def test_normalizes_eol_comment_spacing(tmp_path):
    """An under-spaced end-of-line comment is padded to two spaces."""
    out = run_sort(tmp_path, (
        "en:\n"
        "  zebra: \"Z\" # one space before comment\n"
        "  alpha: \"a # b is not a comment\"\n"
    ))
    # The real eol comment gets two spaces, so the expected text must
    # contain two; the '#' inside the string is untouched.
    assert '"Z"  # one space before comment' in out
    assert '"a # b is not a comment"' in out
    assert out.index("alpha:") < out.index("zebra:")
def test_idempotent(tmp_path):
    """Sorting already-sorted output yields the same text again."""
    source = (
        "en:\n"
        " gamma: 3\n"
        " # note for alpha\n"
        " alpha: 1\n"
        " beta: 2\n"
    )
    first_pass = run_sort(tmp_path, source)
    second_pass = run_sort(tmp_path, first_pass)
    assert first_pass == second_pass