初始化仓库

This commit is contained in:
橙息
2025-09-15 17:47:43 +08:00
commit 32f10eb181
1105 changed files with 64088 additions and 0 deletions

182
voice_extractor.py Normal file
View File

@@ -0,0 +1,182 @@
from dataclasses import dataclass, asdict
from typing import List, Optional
import re
import json
import os
import glob
from collections import defaultdict
@dataclass
class NaninovelEntry:
id: str
source: str # 合并的多行原文
translation: str # 合并的多行翻译
source_plain: str # 清除格式后的原文
translation_plain: str # 清除格式后的翻译
voice_id: Optional[str] = None
character: Optional[str] = None
@staticmethod
def clean_text(text: str) -> str:
"""去除 HTML 标签及多余空格"""
# 去掉 <br> 等 HTML 标签
text = re.sub(r"<.*?>", "", text)
# 去掉全角空格和多余空行
text = re.sub(r"\s+", "", text)
return text.strip()
@dataclass
class ScriptMetadata:
id: str
file_path: str
file_dir: str
header: Optional[str] = None
class NaninovelScript:
# 所有支持的语音匹配正则及对应处理函数
VOICE_PATTERNS = [
# 新格式: ; > @printDebate ... |#voice_id|
(re.compile(r"> *@printDebate.*\|#([A-Za-z0-9_]+)\|"), lambda m: (None, m.group(1))),
# 旧格式: ; > Miria: |#voice_id|
(re.compile(r"> *(\w+): *\|#([A-Za-z0-9_]+)\|"), lambda m: (m.group(1), m.group(2))),
]
@staticmethod
def extract_character_from_voice_id(voice_id: str) -> Optional[str]:
# 从 voice_id 中自动识别角色名(如 0206Trial09_Yuki003 -> Yuki
match = re.search(r'_(\w+?)(\d+)?$', voice_id)
if match:
return match.group(1)
return None
def __init__(self, file_path: str):
self.file_path = file_path
file_dir = os.path.dirname(file_path)
self.id = os.path.splitext(os.path.basename(file_path))[0]
self.metadata = ScriptMetadata(file_path=self.file_path, id=self.id, file_dir=file_dir)
self.entries: List[NaninovelEntry] = []
self.other_remarks: List[str] = [] # 储存所有 ; > 开头的内容
self._parse()
def _parse(self):
current_entry_id = None
source_lines = []
translation_lines = []
character = None
voice_id = None
first_line = True
reading_translation = False
with open(self.file_path, "r", encoding="utf-8") as f:
for line in f:
line = line.rstrip()
if not line:
continue
if first_line and line.startswith(";"):
self.metadata.header = line[1:].strip()
first_line = False
continue
first_line = False
if line.startswith("#"):
if current_entry_id is not None:
source_text = "\n".join(source_lines)
translation_text = "\n".join(translation_lines)
self.entries.append(NaninovelEntry(
id=current_entry_id,
source=source_text,
translation=translation_text,
source_plain=NaninovelEntry.clean_text(source_text),
translation_plain=NaninovelEntry.clean_text(translation_text),
voice_id=voice_id,
character=character,
))
# 初始化新条目
current_entry_id = line[1:].strip()
source_lines = []
translation_lines = []
character = None
voice_id = None
reading_translation = False
elif line.startswith(";"):
content = line[1:].strip()
matched = False
for pattern, handler in self.VOICE_PATTERNS:
match = pattern.match(content)
if match:
char, vid = handler(match)
voice_id = vid
# 如果正则没有直接给出角色名,则尝试自动识别
character = char if char else self.extract_character_from_voice_id(voice_id)
matched = True
break
if matched:
continue # 语音信息行不计入其它备注
# 只保存未被上述匹配的 ; > 行
if content.startswith(">"):
self.other_remarks.append(content)
continue
# 其他以 ; 开头的行根据当前翻译状态加入源文本或译文
if not reading_translation:
source_lines.append(content)
else:
translation_lines.append(content)
else:
reading_translation = True
translation_lines.append(line)
# 保存最后一个条目
if current_entry_id is not None:
source_text = "\n".join(source_lines)
translation_text = "\n".join(translation_lines)
self.entries.append(NaninovelEntry(
id=current_entry_id,
source=source_text,
translation=translation_text,
source_plain=NaninovelEntry.clean_text(source_text),
translation_plain=NaninovelEntry.clean_text(translation_text),
voice_id=voice_id,
character=character,
))
def save_as_json(self, output_path: str):
"""使用 asdict 简化 JSON 导出"""
with open(output_path, "w", encoding="utf-8") as f:
json.dump({
"metadata": asdict(self.metadata),
"entries": [asdict(e) for e in self.entries]
}, f, ensure_ascii=False, indent=2)
# 使用示例
if __name__ == "__main__":
input_dir = r"D:\manosaba"
output_dir = r"D:\manosaba_voice_lists"
os.makedirs(output_dir, exist_ok=True)
character_map = defaultdict(list)
files = glob.glob(os.path.join(input_dir, "general-localization-*-scripts-*", "*.txt"))
for file in files:
print(f"Processing file: {file}")
script = NaninovelScript(file)
# 输出所有 ; > 开头的内容
if script.other_remarks:
print(f"> lines in {file}:")
for remark in script.other_remarks:
print(remark)
dir_name = script.metadata.file_dir.split('-')[-1]
if dir_name == "common_assets_all":
voice_dir = os.path.join(input_dir, f"general-voice-{script.id.lower()}_assets_all")
else:
voice_dir = os.path.join(input_dir, f"general-voice-{dir_name}")
for entry in script.entries:
if entry.character and entry.voice_id:
voice_path = os.path.join(voice_dir, entry.voice_id, f"{entry.voice_id}.wav")
if os.path.exists(voice_path):
character_map[entry.character].append(f"{voice_path}|slicer_opt|JP|{entry.source_plain}")
for character, voices in character_map.items():
with open(os.path.join(output_dir, f"{character}.list"), "w", encoding="utf-8") as f:
for voice in voices:
f.write(voice + "\n")