Files

422 lines
14 KiB
Python
Raw Permalink Normal View History

2026-04-05 16:23:20 +02:00
#!/usr/bin/env python3
"""
One-off maintenance script for syncing translated Wino `resources.json` files
with `en_US/resources.json` and bulk-translating missing keys via the OpenAI API.

By default this script runs in dry-run mode and only reports the planned changes.
Use `--apply` to write updates.

Examples:
    python scripts/translate_resources.py --dry-run
    python scripts/translate_resources.py --apply --model gpt-5-nano
    python scripts/translate_resources.py --apply --locales pl_PL de_DE --chunk-size 120

Usage (PowerShell):
    $env:OPENAI_API_KEY="{open ai key here}"
    python .\scripts\translate_resources.py --dry-run
    python .\scripts\translate_resources.py --apply --model gpt-5-nano --workers 4
"""
from __future__ import annotations
import argparse
import concurrent.futures
import json
import os
import sys
import time
import urllib.error
import urllib.request
from collections import OrderedDict
from pathlib import Path
from typing import Dict, Iterable, List, Sequence, Tuple
# Human-readable language names keyed by locale directory name. The label is
# interpolated into the translation prompt; locales missing from this table
# fall back to the raw locale code at the lookup site.
LOCALE_LABELS = {
    "bg_BG": "Bulgarian (Bulgaria)",
    "ca_ES": "Catalan (Spain)",
    "cs_CZ": "Czech (Czech Republic)",
    "da_DK": "Danish (Denmark)",
    "de_DE": "German (Germany)",
    "el_GR": "Greek (Greece)",
    "es_ES": "Spanish (Spain)",
    "fi_FI": "Finnish (Finland)",
    "fr_FR": "French (France)",
    "gl_ES": "Galician (Spain)",
    "id_ID": "Indonesian (Indonesia)",
    "it_IT": "Italian (Italy)",
    "ja_JP": "Japanese (Japan)",
    "ko_KR": "Korean (South Korea)",
    "lt_LT": "Lithuanian (Lithuania)",
    "nl_NL": "Dutch (Netherlands)",
    "pl_PL": "Polish (Poland)",
    "pt_BR": "Portuguese (Brazil)",
    "ro_RO": "Romanian (Romania)",
    "ru_RU": "Russian (Russia)",
    "sk_SK": "Slovak (Slovakia)",
    "tr_TR": "Turkish (Turkey)",
    "uk_UA": "Ukrainian (Ukraine)",
    "zh_CN": "Chinese, Simplified (China)",
}
def parse_args() -> argparse.Namespace:
    """Parse and validate the command-line arguments.

    The script previews changes unless ``--apply`` is given. Passing both
    ``--apply`` and ``--dry-run`` is rejected; passing neither selects dry-run.

    Returns:
        The parsed namespace. ``dry_run`` is always the inverse of ``apply``.

    Raises:
        SystemExit: Raised through ``parser.error`` when the flags conflict or
            a numeric option is below 1.
    """
    parser = argparse.ArgumentParser(description="Bulk-sync and translate Wino resources.json files.")
    parser.add_argument(
        "--translations-root",
        default=str(Path("Wino.Core.Domain") / "Translations"),
        help="Path to the translations root directory.",
    )
    parser.add_argument(
        "--source-locale",
        default="en_US",
        help="Source locale directory name.",
    )
    parser.add_argument(
        "--locales",
        nargs="*",
        help="Specific locale directory names to process. Defaults to all non-source locales.",
    )
    parser.add_argument(
        "--model",
        default="gpt-5-nano",
        help="OpenAI model name to use for translation.",
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=100,
        help="How many keys to translate per API request.",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=4,
        help="How many locales to process in parallel.",
    )
    parser.add_argument(
        "--max-retries",
        type=int,
        default=4,
        help="Maximum retries per translation chunk.",
    )
    parser.add_argument(
        "--api-key-env",
        default="OPENAI_API_KEY",
        help="Environment variable that stores the OpenAI API key.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Report planned changes without writing files (default).",
    )
    parser.add_argument(
        "--apply",
        action="store_true",
        help="Write the translated files back to disk.",
    )
    args = parser.parse_args()

    if args.apply and args.dry_run:
        parser.error("Choose at most one of --apply or --dry-run.")
    # Omitting both flags means preview only, so normalise the namespace to
    # make `dry_run` reliable for any code that inspects it.
    args.dry_run = not args.apply

    if args.chunk_size < 1:
        parser.error("--chunk-size must be greater than 0.")
    if args.workers < 1:
        parser.error("--workers must be greater than 0.")
    if args.max_retries < 1:
        parser.error("--max-retries must be greater than 0.")
    return args
def read_json(path: Path) -> OrderedDict[str, str]:
    """Load a JSON file into an ``OrderedDict`` that keeps the file's key order.

    The file is decoded as ``utf-8-sig``, so a leading byte-order mark is
    dropped before parsing.
    """
    with path.open("r", encoding="utf-8-sig", newline="") as stream:
        raw_text = stream.read()
    return json.loads(raw_text, object_pairs_hook=OrderedDict)
def has_utf8_bom(path: Path) -> bool:
    """Return True when the file at *path* starts with the UTF-8 byte-order mark."""
    # Only the first three bytes decide the answer, so avoid loading the
    # whole resource file into memory.
    with path.open("rb") as handle:
        return handle.read(3) == b"\xef\xbb\xbf"
def write_json(path: Path, data: OrderedDict[str, str], include_bom: bool) -> None:
    """Serialise *data* to *path* as 4-space-indented JSON with CRLF line endings.

    Non-ASCII characters are written verbatim rather than escaped. When
    *include_bom* is true the file is encoded as ``utf-8-sig`` (with a
    byte-order mark), otherwise as plain ``utf-8``.
    """
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    codec = "utf-8"
    if include_bom:
        codec = "utf-8-sig"
    # newline="\r\n" makes the text layer translate every "\n" to CRLF on write.
    with path.open("w", encoding=codec, newline="\r\n") as stream:
        stream.write(serialized + "\n")
def chunk_items(items: Sequence[Tuple[str, str]], chunk_size: int) -> Iterable[List[Tuple[str, str]]]:
    """Yield consecutive slices of *items* as lists of at most *chunk_size* entries.

    The final list may be shorter than *chunk_size*; an empty input yields nothing.
    """
    yield from (
        list(items[lower : lower + chunk_size])
        for lower in range(0, len(items), chunk_size)
    )
def build_schema(keys: Sequence[str]) -> Dict[str, object]:
    """Build the strict JSON-schema descriptor for one translation batch.

    The schema describes an object in which every entry of *keys* is a
    required string property and no other properties are allowed.
    """
    string_properties = {}
    for name in keys:
        string_properties[name] = {"type": "string"}

    object_schema = {
        "type": "object",
        "additionalProperties": False,
        "properties": string_properties,
        "required": list(keys),
    }
    return {
        "name": "resource_translation_batch",
        "strict": True,
        "schema": object_schema,
    }
def extract_message_content(payload: Dict[str, object]) -> str:
    """Return ``payload["choices"][0]["message"]["content"]``.

    Raises:
        RuntimeError: If any step of that lookup fails with ``KeyError``,
            ``IndexError`` or ``TypeError``; the message embeds the payload.
    """
    try:
        first_choice = payload["choices"][0]
        message = first_choice["message"]
        content = message["content"]
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(f"Unexpected OpenAI response shape: {payload}") from exc
    return content
def call_openai_chat(
    *,
    api_key: str,
    model: str,
    locale: str,
    language_label: str,
    entries: Sequence[Tuple[str, str]],
) -> Dict[str, str]:
    """Translate one batch of entries with a single Chat Completions request.

    Args:
        api_key: Bearer token sent in the ``Authorization`` header.
        model: Model name placed in the request body.
        locale: Locale code mentioned in the prompt.
        language_label: Human-readable target language used in the prompt.
        entries: ``(key, English value)`` pairs to translate.

    Returns:
        A dict mapping every requested key to its translated string, in the
        same order as *entries*.

    Raises:
        RuntimeError: On HTTP/transport failures, on a response that is not
            valid JSON, or when the decoded result is not an object holding
            exactly the requested keys with string values.
    """
    keys = [key for key, _ in entries]
    prompt_lines = [
        f"Translate the following UI localization values from English to {language_label}.",
        "Return only translated string values for the provided keys.",
        "Keep the keys exactly the same.",
        "Preserve placeholders and formatting exactly, including but not limited to:",
        "- {0}, {1}, {Name}, {{escaped braces}}",
        "- %s, %1$s and similar printf-style tokens",
        "- Ellipses, punctuation, capitalization, line breaks, tabs, HTML, Markdown, and URLs",
        "- Product and protocol names such as Wino, Gmail, Outlook, IMAP, SMTP, OAuth, CalDav, Edge, Chrome, Firefox, Jodit, WebView2, SQLite",
        "Prefer natural UI phrasing for the target locale.",
        f"Locale code: {locale}",
        "",
        "Strings to translate:",
    ]
    prompt_lines.extend(f"{key}: {value}" for key, value in entries)

    body = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a software localization engine. "
                    "Translate accurately and conservatively. "
                    "Never omit keys. Never add commentary."
                ),
            },
            {"role": "user", "content": "\n".join(prompt_lines)},
        ],
        # Ask for a response constrained to the exact key set of this batch.
        "response_format": {
            "type": "json_schema",
            "json_schema": build_schema(keys),
        },
    }
    request = urllib.request.Request(
        url="https://api.openai.com/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        method="POST",
    )

    try:
        with urllib.request.urlopen(request, timeout=180) as response:
            payload = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        details = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"OpenAI API HTTP {exc.code}: {details}") from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(f"OpenAI API request failed: {exc}") from exc
    except OSError as exc:
        # Timeouts or resets while reading the body are raised as plain
        # OSError subclasses, not URLError; report them the same way.
        raise RuntimeError(f"OpenAI API request failed: {exc}") from exc
    except ValueError as exc:
        # Covers both undecodable bytes and a body that is not JSON.
        raise RuntimeError(f"OpenAI API returned a non-JSON response: {exc}") from exc

    content = extract_message_content(payload)
    if not isinstance(content, str):
        raise RuntimeError(f"OpenAI response did not contain text content: {content!r}")
    try:
        result = json.loads(content)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"OpenAI response content is not valid JSON: {exc}") from exc
    if not isinstance(result, dict):
        raise RuntimeError(
            f"Translated batch is not a JSON object (got {type(result).__name__})."
        )

    if sorted(result.keys()) != sorted(keys):
        raise RuntimeError(
            "Translated batch returned the wrong key set. "
            f"Expected {len(keys)} keys, got {len(result)} keys."
        )

    # Guard the resource files against non-string values slipping through.
    non_string_keys = [key for key in keys if not isinstance(result[key], str)]
    if non_string_keys:
        raise RuntimeError(
            f"Translated batch returned non-string values for keys: {non_string_keys}"
        )
    return {key: result[key] for key in keys}
def translate_missing_entries(
    *,
    api_key: str,
    model: str,
    locale: str,
    entries: Sequence[Tuple[str, str]],
    chunk_size: int,
    max_retries: int,
) -> Dict[str, str]:
    """Translate *entries* chunk by chunk, retrying each failed chunk.

    Args:
        api_key: API key forwarded to ``call_openai_chat``.
        model: Model name forwarded to ``call_openai_chat``.
        locale: Target locale code; also used to look up the language label.
        entries: ``(key, English value)`` pairs that need translation.
        chunk_size: Maximum number of entries per API request.
        max_retries: Maximum number of attempts per chunk (must be >= 1).

    Returns:
        A dict of translated values keyed by resource key; empty when
        *entries* is empty.

    Raises:
        ValueError: If *max_retries* is below 1 while there is work to do.
        RuntimeError: If a chunk still fails on its final attempt.
    """
    if not entries:
        return {}
    if max_retries < 1:
        # With zero attempts the retry loop would never run and every chunk
        # would be skipped silently, leaving the caller with no translations.
        raise ValueError("max_retries must be at least 1.")

    translated: Dict[str, str] = {}
    language_label = LOCALE_LABELS.get(locale, locale)
    for chunk_index, chunk in enumerate(chunk_items(entries, chunk_size), start=1):
        for attempt in range(1, max_retries + 1):
            try:
                result = call_openai_chat(
                    api_key=api_key,
                    model=model,
                    locale=locale,
                    language_label=language_label,
                    entries=chunk,
                )
            except Exception as exc:  # noqa: BLE001
                if attempt == max_retries:
                    raise RuntimeError(
                        f"Failed translating chunk {chunk_index} for {locale}: {exc}"
                    ) from exc
                print(
                    f"[{locale}] retrying chunk {chunk_index} after error: {exc}",
                    flush=True,
                )
                # Linear back-off: 2s, 4s, 6s, ... between attempts.
                time.sleep(attempt * 2)
            else:
                translated.update(result)
                print(
                    f"[{locale}] translated chunk {chunk_index} "
                    f"({len(chunk)} keys) on attempt {attempt}",
                    flush=True,
                )
                break
    return translated
def process_locale(
    locale: str,
    source_data: OrderedDict[str, str],
    translations_root: Path,
    args: argparse.Namespace,
    api_key: str | None,
) -> Dict[str, object]:
    """Sync one locale's ``resources.json`` against the source data.

    Keys present in the source but absent from the locale are translated when
    ``args.apply`` is set. The merged result follows the source key order, so
    keys that exist only in the locale file are not carried over. The file is
    rewritten only in apply mode, keeping its original BOM setting.

    Returns:
        A summary dict with the keys ``locale``, ``path``, ``existing``,
        ``missing``, ``extra``, ``translated`` and ``wrote``.

    Raises:
        FileNotFoundError: If the locale's ``resources.json`` does not exist.
        RuntimeError: If translation is required in apply mode but no API key
            was supplied.
    """
    resource_path = translations_root / locale / "resources.json"
    if not resource_path.exists():
        raise FileNotFoundError(f"Locale file not found: {resource_path}")

    current = read_json(resource_path)
    keep_bom = has_utf8_bom(resource_path)

    ordered_source_keys = list(source_data.keys())
    absent = [name for name in ordered_source_keys if name not in current]
    surplus = [name for name in current if name not in source_data]

    fresh_translations: Dict[str, str] = {}
    needs_translation = bool(absent) and args.apply
    if needs_translation:
        if not api_key:
            raise RuntimeError(
                f"Missing API key. Set the {args.api_key_env} environment variable before using --apply."
            )
        fresh_translations = translate_missing_entries(
            api_key=api_key,
            model=args.model,
            locale=locale,
            entries=[(name, source_data[name]) for name in absent],
            chunk_size=args.chunk_size,
            max_retries=args.max_retries,
        )

    def resolve(name: str) -> str:
        # Precedence: existing translation, then new translation, then English.
        if name in current:
            return current[name]
        if name in fresh_translations:
            return fresh_translations[name]
        return source_data[name]

    merged = OrderedDict((name, resolve(name)) for name in ordered_source_keys)

    if args.apply:
        write_json(resource_path, merged, include_bom=keep_bom)

    summary: Dict[str, object] = {
        "locale": locale,
        "path": str(resource_path),
        "existing": len(ordered_source_keys) - len(absent),
        "missing": len(absent),
        "extra": len(surplus),
        "translated": len(fresh_translations),
        "wrote": args.apply,
    }
    return summary
def discover_locales(translations_root: Path, source_locale: str, requested: Sequence[str] | None) -> List[str]:
    """Return the locale directory names to process.

    A non-empty *requested* sequence is returned as a list unchanged.
    Otherwise the result is every sub-directory of *translations_root*, in
    sorted order, that is not the source locale and contains a
    ``resources.json`` file.
    """
    if requested:
        return list(requested)

    return [
        child.name
        for child in sorted(translations_root.iterdir())
        if child.is_dir()
        and child.name != source_locale
        and (child / "resources.json").exists()
    ]
def main() -> int:
    """Run the sync for every target locale and print a summary.

    Locales are processed in a thread pool. A failure in one locale is
    reported on stderr but does not hide the outcome of the others: every
    future is drained, all results are summarised, and the exit code reflects
    whether any locale failed.

    Returns:
        ``0`` when every locale succeeded; ``1`` when the source file is
        missing, no target locales were found, or at least one locale failed.
    """
    args = parse_args()
    translations_root = Path(args.translations_root)
    source_path = translations_root / args.source_locale / "resources.json"
    if not source_path.exists():
        print(f"Source file not found: {source_path}", file=sys.stderr)
        return 1

    source_data = read_json(source_path)
    locales = discover_locales(translations_root, args.source_locale, args.locales)
    if not locales:
        print("No target locales found.", file=sys.stderr)
        return 1

    api_key = os.environ.get(args.api_key_env)
    print(
        f"Processing {len(locales)} locale(s) from {source_path} "
        f"using model {args.model} in {'apply' if args.apply else 'dry-run'} mode.",
        flush=True,
    )

    results: List[Dict[str, object]] = []
    failed_locales: List[str] = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=min(args.workers, len(locales))) as executor:
        future_map = {
            executor.submit(process_locale, locale, source_data, translations_root, args, api_key): locale
            for locale in locales
        }
        for future in concurrent.futures.as_completed(future_map):
            locale = future_map[future]
            try:
                result = future.result()
            except Exception as exc:  # noqa: BLE001
                # Leaving the `with` block waits for the remaining workers
                # anyway, so keep draining to report what they actually did
                # instead of returning early and discarding their outcome.
                print(f"[{locale}] failed: {exc}", file=sys.stderr, flush=True)
                failed_locales.append(locale)
                continue
            results.append(result)
            print(
                f"[{result['locale']}] missing={result['missing']} "
                f"extra={result['extra']} translated={result['translated']} "
                f"mode={'write' if result['wrote'] else 'preview'}",
                flush=True,
            )

    results.sort(key=lambda item: item["locale"])
    total_missing = sum(int(item["missing"]) for item in results)
    total_extra = sum(int(item["extra"]) for item in results)
    total_translated = sum(int(item["translated"]) for item in results)
    print(
        f"Done. locales={len(results)} missing={total_missing} "
        f"extra={total_extra} translated={total_translated}",
        flush=True,
    )

    if failed_locales:
        print(
            f"Failed locales ({len(failed_locales)}): {', '.join(sorted(failed_locales))}",
            file=sys.stderr,
            flush=True,
        )
        return 1
    return 0
if __name__ == "__main__":
raise SystemExit(main())