"""Convert XML data files under a StreamingAssets directory into JSON files.

Usage: python data_convert.py <source> <destination>

Every ``*.xml`` file found at least two directories below ``<source>`` is
parsed, its single root element is unwrapped, and the result is written as
JSON under ``<destination>``.
"""
import argparse
import shutil
from functools import partial
from pathlib import Path
from typing import Optional

import orjson
import xmltodict
from hypy_utils import write
from hypy_utils.tqdm_utils import pmap

# XML-namespace declaration attributes that are stripped from the root element
_XMLNS_KEYS = ('@xmlns:xsi', '@xmlns:xsd')


def convert_one(file: Path, source: Optional[Path] = None, destination: Optional[Path] = None):
    """Convert a single XML file to JSON.

    :param file: Path of the XML file; must be located under ``source``.
    :param source: Root directory the relative path is computed against.
        Defaults to the module-level ``src`` set by the script entry point.
    :param destination: Root directory the JSON file is written under.
        Defaults to the module-level ``dst`` set by the script entry point.
    :raises ValueError: If ``file`` is not located under ``source``
        (raised by ``Path.relative_to``).
    """
    # Passing the directories explicitly keeps this function usable from
    # worker processes that never executed the ``__main__`` block; the
    # globals remain as a fallback for callers using the one-argument form.
    if source is None:
        source = src
    if destination is None:
        destination = dst

    # Get path relative to source
    rel = file.relative_to(source)

    # If path is one-level under StreamingAssets, ignore it (e.g. StreamingAssets/A000/Data.xml)
    if len(rel.parts) <= 2:
        return

    # Read xml. Decode explicitly as UTF-8 so the result does not depend on
    # the platform's locale encoding.
    xml = xmltodict.parse(file.read_text(encoding='utf-8'))

    # There should only be one root element, expand it
    assert len(xml) == 1, f'Expected 1 root element, got {len(xml)}'
    xml = next(iter(xml.values()))

    # Remove @xmlns:xsi and @xmlns:xsd. The root may be None or a plain
    # string when the element is empty or text-only, so only touch mappings.
    if isinstance(xml, dict):
        for key in _XMLNS_KEYS:
            xml.pop(key, None)

    # Generate target file path
    # Ignore the first segment of the relative path, and append to the destination
    # Also collapse the single-item directory into the filename
    # e.g. {src}/A000/music/music000001/Music.xml -> {dst}/music/000001.json
    parent_name = rel.parts[-2]
    # Fall back to the full directory name when it contains no digits;
    # otherwise every such file would be written to the same '.json' path.
    file_id = ''.join(filter(str.isdigit, parent_name)) or parent_name
    target = destination.joinpath(*rel.parts[1:-2]) / f'{file_id}.json'

    # Create directories if they don't exist
    target.parent.mkdir(parents=True, exist_ok=True)

    # Write json
    write(target, orjson.dumps(xml))


if __name__ == '__main__':
    agupa = argparse.ArgumentParser()
    agupa.add_argument('source', type=str, help='Package/Sinmai_Data/StreamingAssets directory')
    agupa.add_argument('destination', type=str, help='Directory to extract to')
    args = agupa.parse_args()

    src = Path(args.source)
    dst = Path(args.destination)

    # Validate that A000 exists in the source directory. Uses the parser's
    # error exit instead of `assert`, which is stripped under `python -O`.
    if not (src / 'A000').exists():
        agupa.error(f'{src}/A000 does not exist')

    # If the target directory already exists, offer to delete it first.
    # Any answer other than "y" keeps the directory; converted files are then
    # written into it, overwriting files with the same name.
    if dst.exists():
        if input(f'{dst} already exists, delete? (y/n): ').strip().lower() == 'y':
            print(f'Deleting {dst}')
            shutil.rmtree(dst)

    # Find all xml files in the source directory
    files = list(src.rglob('*.xml'))
    print(f'Found {len(files)} xml files')

    # Parallel map over all files.
    # NOTE(review): pmap presumably dispatches to worker processes - confirm
    # against hypy_utils. The directories are bound with partial() so workers
    # do not depend on the globals assigned in this block.
    pmap(partial(convert_one, source=src, destination=dst), files,
         desc='Converting', unit='file', chunksize=50)