# ISO 639 language codes
#
# Pipeline: the vendored tables are converted to JSON by scripts/csv2json.py,
# merged into a single enriched document by a jq script, and finally rendered
# into one schema per code set by the jq templates.

# The ISO 639-2 table has no header row and is pipe-delimited, so the column
# names are supplied explicitly. It is decoded as utf-8-sig so that a leading
# byte order mark does not end up in the first field.
build/iso/language/iso-639-2.json: \
	vendor/data/iso/language/ISO-639-2_utf-8.txt \
	scripts/csv2json.py
	$(PYTHON) $(word 2,$^) --delimiter '|' --encoding utf-8-sig --no-header \
		--field-names "part2b,part2t,part1,name,name_french" $< $@

# The ISO 639-3 table is tab-delimited and carries its own header row
build/iso/language/iso-639-3.json: \
	vendor/data/iso/language/iso-639-3_Code_Tables/iso-639-3_Code_Tables_20251015/iso-639-3.tab \
	scripts/csv2json.py
	$(PYTHON) $(word 2,$^) --tab $< $@

# Both tables are passed to jq as --slurpfile variables ($$iso2, $$iso3) and
# the script runs with no input (-n). The directory is created explicitly
# rather than relying on a prerequisite having created it as a side effect.
build/iso/language/enriched.json: \
	build/iso/language/iso-639-2.json \
	build/iso/language/iso-639-3.json \
	scripts/iso-language-enrich.jq
	$(MKDIRP) $(dir $@)
	$(JQ) --slurpfile iso2 $< --slurpfile iso3 $(word 2,$^) -n -f $(word 3,$^) > $@

# One schema per template: the stem (%) selects both the template and the output
schemas/iso/language/2023/%.json: \
	build/iso/language/enriched.json \
	templates/iso/language/2023/%.jq
	$(MKDIRP) $(dir $@)
	$(JQ) --from-file $(word 2,$^) $< > $@
	$(JSONSCHEMA) fmt $@
import json
import os
import sys
import csv


def parse_iso_639_2(file_path, set_3_lookup_by_part1, set_3_lookup_by_part2b, set_3_lookup_by_part2t):
    """Split the pipe-delimited ISO 639-2 listing into Sets 1, 2 and 5.

    Each non-blank line is expected to hold at least five ``|``-separated
    columns: bibliographic code, terminologic code, two-letter code, English
    name and French name. Lines with fewer columns are ignored.

    A row whose English name contains "languages" or "language family" (case
    insensitive) and that has a bibliographic code is recorded as a Set 5
    entry only. Every other row feeds Set 1 and/or Set 2, with ``scope`` and
    ``language_type`` copied from the matching Set 3 lookup when one exists.

    Args:
        file_path: Path of the ISO 639-2 text file (read as ``utf-8-sig``).
        set_3_lookup_by_part1: Set 3 classification keyed by two-letter code.
        set_3_lookup_by_part2b: Set 3 classification keyed by bibliographic code.
        set_3_lookup_by_part2t: Set 3 classification keyed by terminologic code.

    Returns:
        A ``(set_1, set_2_bibliographic, set_2_terminologic, set_5)`` tuple of
        dictionaries keyed by code.
    """
    two_letter = {}
    bibliographic = {}
    terminologic = {}
    families = {}

    with open(file_path, 'r', encoding='utf-8-sig') as handle:
        for raw_line in handle:
            record = raw_line.strip()
            if not record:
                continue

            columns = record.split('|')
            if len(columns) < 5:
                continue

            part2b, part2t, part1, english, french = (
                column.strip() for column in columns[:5]
            )

            lowered = english.lower()
            looks_like_family = 'languages' in lowered or 'language family' in lowered

            if looks_like_family and part2b:
                # Families and groups belong to Set 5 and nowhere else
                families[part2b] = {'name': english, 'name_french': french}
                continue

            if part1:
                classification = set_3_lookup_by_part1.get(part1, {})
                two_letter[part1] = {
                    'name': english,
                    'name_french': french,
                    'part2b': part2b or None,
                    'part2t': part2t or None,
                    'scope': classification.get('scope'),
                    'language_type': classification.get('language_type'),
                }

            # The bibliographic and terminologic variants share one shape
            three_letter_targets = (
                (part2b, set_3_lookup_by_part2b, bibliographic),
                (part2t, set_3_lookup_by_part2t, terminologic),
            )
            for code, lookup, target in three_letter_targets:
                if not code:
                    continue
                classification = lookup.get(code, {})
                target[code] = {
                    'name': english,
                    'name_french': french,
                    'part1': part1 or None,
                    'scope': classification.get('scope'),
                    'language_type': classification.get('language_type'),
                }

    return two_letter, bibliographic, terminologic, families
def parse_iso_639_3(file_path):
    """Read the tab-delimited ISO 639-3 table (Set 3) into a dictionary.

    The file must start with a header row naming at least the columns ``Id``,
    ``Ref_Name``, ``Scope``, ``Language_Type``, ``Part2b``, ``Part2t``,
    ``Part1`` and ``Comment``. Rows with an empty ``Id`` are skipped.

    Args:
        file_path: Path of the UTF-8 encoded table.

    Returns:
        A dictionary keyed by the three-letter code. Every value has the keys
        ``name``, ``scope``, ``language_type``, ``part2b``, ``part2t``,
        ``part1`` and ``comment``; all but ``name`` are ``None`` when the
        column is empty.
    """
    def optional(row, column):
        # Empty (or whitespace-only) cells are reported as None
        return row[column].strip() or None

    set_3 = {}

    # newline='' lets the csv module do its own line-ending handling, so a
    # quoted field that contains a line break is returned unmodified
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        for row in csv.DictReader(file, delimiter='\t'):
            code = row['Id'].strip()
            if not code:
                continue

            set_3[code] = {
                'name': row['Ref_Name'].strip(),
                'scope': optional(row, 'Scope'),
                'language_type': optional(row, 'Language_Type'),
                'part2b': optional(row, 'Part2b'),
                'part2t': optional(row, 'Part2t'),
                'part1': optional(row, 'Part1'),
                'comment': optional(row, 'Comment'),
            }

    return set_3


def build_set_3_lookups(set_3):
    """Index the Set 3 classification by the codes of the other sets.

    Args:
        set_3: Dictionary as returned by ``parse_iso_639_3``.

    Returns:
        A ``(by_part1, by_part2b, by_part2t)`` tuple. Each dictionary maps a
        code to ``{'scope': ..., 'language_type': ...}`` and only contains the
        Set 3 entries that define the corresponding code.
    """
    lookups = {'part1': {}, 'part2b': {}, 'part2t': {}}

    for metadata in set_3.values():
        for key, table in lookups.items():
            if metadata.get(key):
                # A fresh dictionary per table, so the tables never share state
                table[metadata[key]] = {
                    'scope': metadata['scope'],
                    'language_type': metadata['language_type'],
                }

    return lookups['part1'], lookups['part2b'], lookups['part2t']


def expand_scope(scope_code):
    """Translate a one-letter ISO 639-3 scope code into a word.

    Args:
        scope_code: ``'I'``, ``'M'`` or ``'S'``; any falsy value means "absent".

    Returns:
        ``'individual'``, ``'macrolanguage'`` or ``'special'``, or ``None``
        when ``scope_code`` is falsy.

    Raises:
        SystemExit: With status 1, after printing a message to standard error,
            when the code is not one of the known letters.
    """
    if not scope_code:
        return None

    scope_map = {
        'I': 'individual',
        'M': 'macrolanguage',
        'S': 'special'
    }

    if scope_code not in scope_map:
        print(f"Error: Unknown scope code '{scope_code}'", file=sys.stderr)
        sys.exit(1)

    return scope_map[scope_code]
def expand_language_type(type_code):
    """Translate a one-letter ISO 639-3 language type into a word.

    Returns ``None`` for a falsy code. An unknown code prints an error to
    standard error and terminates the process with status 1.
    """
    if not type_code:
        return None

    known_types = (
        ('L', 'living'),
        ('E', 'extinct'),
        ('H', 'historic'),
        ('C', 'constructed'),
        ('S', 'special'),
        ('A', 'ancient'),
    )
    for letter, word in known_types:
        if type_code == letter:
            return word

    print(f"Error: Unknown language type code '{type_code}'", file=sys.stderr)
    sys.exit(1)


def build_base_schema():
    """Return a new dictionary holding the keywords shared by every schema."""
    base = {}
    base["$schema"] = "https://json-schema.org/draft/2020-12/schema"
    base["x-license"] = "https://github.com/sourcemeta/std/blob/main/LICENSE"
    base["x-links"] = ["https://www.iso.org/standard/74575.html"]
    return base


def generate_set_1_schema(set_1, output_dir):
    """Write ``set-1.json``, the schema of the two-letter ISO 639-1 codes.

    Args:
        set_1: Mapping of two-letter code to metadata (``name`` is required;
            ``scope``, ``language_type``, ``name_french``, ``part2b`` and
            ``part2t`` are emitted as ``x-`` keywords when truthy).
        output_dir: Existing directory that receives the file.
    """
    ordered_codes = sorted(set_1)

    branches = []
    for code in ordered_codes:
        metadata = set_1[code]
        branch = {"const": code, "title": metadata['name']}
        if metadata.get('scope'):
            branch['x-scope'] = expand_scope(metadata['scope'])
        if metadata.get('language_type'):
            branch['x-language-type'] = expand_language_type(metadata['language_type'])
        if metadata.get('name_french'):
            branch['x-name-french'] = metadata['name_french']
        if metadata.get('part2b'):
            branch['x-set-2-bibliographic'] = metadata['part2b']
        if metadata.get('part2t'):
            branch['x-set-2-terminologic'] = metadata['part2t']
        branches.append(branch)

    schema = build_base_schema()
    schema["$comment"] = "Set 1 contains the most common languages (2-letter codes). All Set 1 codes have corresponding codes in Set 2 and Set 3"
    schema["title"] = "ISO 639-1:2023 Language Code"
    schema["description"] = "A two-letter language code from ISO 639-1"
    schema["examples"] = ordered_codes[:4]
    schema["anyOf"] = branches

    file_path = os.path.join(output_dir, "set-1.json")
    with open(file_path, 'w') as file:
        file.write(json.dumps(schema, indent=2))
        file.write('\n')
    print(f"Generated {file_path} with {len(set_1)} codes")
def generate_set_2_bibliographic_schema(set_2_bibliographic, output_dir):
    """Write ``set-2-bibliographic.json`` for the ISO 639-2 bibliographic codes.

    Args:
        set_2_bibliographic: Mapping of bibliographic code to metadata
            (``name`` is required; ``scope``, ``language_type``,
            ``name_french`` and ``part1`` become ``x-`` keywords when truthy).
        output_dir: Existing directory that receives the file.
    """
    ordered_codes = sorted(set_2_bibliographic)

    branches = []
    for code in ordered_codes:
        metadata = set_2_bibliographic[code]
        branch = {"const": code, "title": metadata['name']}
        if metadata.get('scope'):
            branch['x-scope'] = expand_scope(metadata['scope'])
        if metadata.get('language_type'):
            branch['x-language-type'] = expand_language_type(metadata['language_type'])
        if metadata.get('name_french'):
            branch['x-name-french'] = metadata['name_french']
        if metadata.get('part1'):
            branch['x-set-1'] = metadata['part1']
        branches.append(branch)

    schema = build_base_schema()
    schema["$comment"] = "Set 2 bibliographic is a superset of Set 1 and a subset of Set 3. Bibliographic codes are based on English language names"
    schema["title"] = "ISO 639-2:2023 Bibliographic Language Code"
    schema["description"] = "A three-letter bibliographic language code from ISO 639-2"
    schema["examples"] = ordered_codes[:4]
    schema["anyOf"] = branches

    file_path = os.path.join(output_dir, "set-2-bibliographic.json")
    with open(file_path, 'w') as file:
        file.write(json.dumps(schema, indent=2))
        file.write('\n')
    print(f"Generated {file_path} with {len(set_2_bibliographic)} codes")
def generate_set_2_terminologic_schema(set_2_terminologic, output_dir):
    """Write ``set-2-terminologic.json`` for the ISO 639-2 terminologic codes.

    Args:
        set_2_terminologic: Mapping of terminologic code to metadata
            (``name`` is required; ``scope``, ``language_type``,
            ``name_french`` and ``part1`` become ``x-`` keywords when truthy).
        output_dir: Existing directory that receives the file.
    """
    def describe(code):
        # One `anyOf` branch; optional keywords are appended in a fixed order
        metadata = set_2_terminologic[code]
        entry = {"const": code, "title": metadata['name']}
        scope = metadata.get('scope')
        if scope:
            entry['x-scope'] = expand_scope(scope)
        language_type = metadata.get('language_type')
        if language_type:
            entry['x-language-type'] = expand_language_type(language_type)
        french = metadata.get('name_french')
        if french:
            entry['x-name-french'] = french
        two_letter = metadata.get('part1')
        if two_letter:
            entry['x-set-1'] = two_letter
        return entry

    codes = sorted(set_2_terminologic)

    schema = build_base_schema()
    schema.update([
        ("$comment", "Set 2 terminologic is a superset of Set 1 and a subset of Set 3. Terminologic codes are based on native language names. Only about 20 languages have both bibliographic and terminologic codes"),
        ("title", "ISO 639-2:2023 Terminologic Language Code"),
        ("description", "A three-letter terminologic language code from ISO 639-2"),
        ("examples", codes[:4]),
        ("anyOf", [describe(code) for code in codes]),
    ])

    file_path = os.path.join(output_dir, "set-2-terminologic.json")
    with open(file_path, 'w') as file:
        json.dump(schema, file, indent=2)
        file.write('\n')
    print(f"Generated {file_path} with {len(set_2_terminologic)} codes")
def generate_set_3_schema(set_3, output_dir):
    """Write ``set-3.json``, the schema of the three-letter ISO 639-3 codes.

    Args:
        set_3: Mapping of code to metadata (``name`` is required; ``scope``,
            ``language_type``, ``part2b``, ``part2t`` and ``part1`` become
            ``x-`` keywords and ``comment`` becomes ``$comment`` when truthy).
        output_dir: Existing directory that receives the file.
    """
    # Metadata keys copied verbatim, in output order
    passthrough = (
        ('part2b', 'x-set-2-bibliographic'),
        ('part2t', 'x-set-2-terminologic'),
        ('part1', 'x-set-1'),
        ('comment', '$comment'),
    )

    ordered_codes = sorted(set_3)
    branches = []
    for code in ordered_codes:
        metadata = set_3[code]
        branch = {"const": code, "title": metadata['name']}
        if metadata.get('scope'):
            branch['x-scope'] = expand_scope(metadata['scope'])
        if metadata.get('language_type'):
            branch['x-language-type'] = expand_language_type(metadata['language_type'])
        for source_key, keyword in passthrough:
            if metadata.get(source_key):
                branch[keyword] = metadata[source_key]
        branches.append(branch)

    schema = build_base_schema()
    schema["$comment"] = "Set 3 is a superset of Set 1 and Set 2. It provides comprehensive coverage of individual languages, macrolanguages, and special codes"
    schema["title"] = "ISO 639-3:2023 Language Code"
    schema["description"] = "A three-letter language code from ISO 639-3"
    schema["examples"] = ordered_codes[:4]
    schema["anyOf"] = branches

    file_path = os.path.join(output_dir, "set-3.json")
    with open(file_path, 'w') as file:
        file.write(json.dumps(schema, indent=2))
        file.write('\n')
    print(f"Generated {file_path} with {len(set_3)} codes")
def generate_set_5_schema(set_5, output_dir):
    """Write ``set-5.json``, the schema of the ISO 639-5 family and group codes.

    Args:
        set_5: Mapping of code to metadata (``name`` is required;
            ``name_french`` becomes ``x-name-french`` when truthy).
        output_dir: Existing directory that receives the file.
    """
    ordered_codes = sorted(set_5)

    branches = []
    for code in ordered_codes:
        metadata = set_5[code]
        branch = {"const": code, "title": metadata['name']}
        if metadata.get('name_french'):
            branch['x-name-french'] = metadata['name_french']
        branches.append(branch)

    schema = build_base_schema()
    schema["$comment"] = "Set 5 codes language families and groups, not individual languages. It is independent from Sets 1-3"
    schema["title"] = "ISO 639-5:2023 Language Family Code"
    schema["description"] = "A three-letter code for language families and groups from ISO 639-5"
    schema["examples"] = ordered_codes[:4]
    schema["anyOf"] = branches

    file_path = os.path.join(output_dir, "set-5.json")
    with open(file_path, 'w') as file:
        file.write(json.dumps(schema, indent=2))
        file.write('\n')
    print(f"Generated {file_path} with {len(set_5)} codes")


def main():
    """Regenerate every ISO 639 schema from the vendored data files.

    Paths are resolved relative to this script: the project root is three
    directories above the script's own directory. The process exits with
    status 1 when either data file is missing.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(os.path.dirname(os.path.dirname(here)))

    language_data = os.path.join(project_root, "vendor", "data", "iso", "language")
    iso_639_2_file = os.path.join(language_data, "ISO-639-2_utf-8.txt")
    iso_639_3_file = os.path.join(
        language_data,
        "iso-639-3_Code_Tables",
        "iso-639-3_Code_Tables_20251015",
        "iso-639-3.tab",
    )
    output_dir = os.path.join(project_root, "schemas", "iso", "language", "2023")

    for data_file in (iso_639_2_file, iso_639_3_file):
        if not os.path.exists(data_file):
            print(f"Error: Data file not found: {data_file}", file=sys.stderr)
            sys.exit(1)

    # Set 3 goes first: its classification enriches the other sets
    print("Parsing ISO 639-3 data...")
    set_3 = parse_iso_639_3(iso_639_3_file)

    print("Building Set 3 lookup tables...")
    lookups = build_set_3_lookups(set_3)

    print("Parsing ISO 639-2 data...")
    set_1, set_2_bibliographic, set_2_terminologic, set_5 = parse_iso_639_2(
        iso_639_2_file, *lookups
    )

    os.makedirs(output_dir, exist_ok=True)

    print("Generating schemas...")
    generate_set_1_schema(set_1, output_dir)
    generate_set_2_bibliographic_schema(set_2_bibliographic, output_dir)
    generate_set_2_terminologic_schema(set_2_terminologic, output_dir)
    generate_set_3_schema(set_3, output_dir)
    generate_set_5_schema(set_5, output_dir)

    print("Done!")
#!/usr/bin/env python3
"""Convert a delimited text file (CSV, TSV, ...) into a JSON array."""

import csv
import json
import os
import sys
import argparse


def csv_to_json(input_file, output_file, delimiter, has_header, encoding, field_names):
    """Convert a delimited text file into a JSON array written to disk.

    Args:
        input_file: Path of the file to read.
        output_file: Path of the JSON file to write. Missing parent
            directories are created.
        delimiter: Single-character field separator.
        has_header: When true the first row names the columns and every other
            row becomes an object. ``field_names`` is ignored in that case.
        encoding: Text encoding of ``input_file``.
        field_names: Column names to use when ``has_header`` is false. When it
            is falsy as well, every row becomes an array of strings.
    """
    # newline='' is what the csv module asks for: it performs its own
    # line-ending handling, so line breaks inside quoted fields survive intact
    with open(input_file, 'r', encoding=encoding, newline='') as file:
        if has_header:
            reader = csv.DictReader(file, delimiter=delimiter)
        elif field_names:
            reader = csv.DictReader(file, delimiter=delimiter, fieldnames=field_names)
        else:
            reader = csv.reader(file, delimiter=delimiter)
        # The reader is lazy, so it must be drained while the file is open
        rows = list(reader)

    output_directory = os.path.dirname(output_file)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(rows, file, indent=2, ensure_ascii=False)
        file.write('\n')


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list without the program name. Defaults to the
            process arguments, which keeps plain ``main()`` calls working
            while letting other code drive the converter directly.
    """
    parser = argparse.ArgumentParser(description='Convert CSV/TSV files to JSON')
    parser.add_argument('input', help='Input CSV/TSV file')
    parser.add_argument('output', help='Output JSON file')
    parser.add_argument('-d', '--delimiter', default=',', help='Field delimiter (default: comma)')
    parser.add_argument('--tab', action='store_true', help='Use tab as delimiter (shortcut for -d $\'\\t\')')
    parser.add_argument('--no-header', action='store_true', help='First line is not a header (output as array of arrays)')
    parser.add_argument('-e', '--encoding', default='utf-8', help='Input file encoding (default: utf-8)')
    parser.add_argument('--field-names', help='Comma-separated field names (use with --no-header to create objects)')

    args = parser.parse_args(argv)

    # --tab wins over --delimiter when both are given
    delimiter = '\t' if args.tab else args.delimiter
    has_header = not args.no_header
    # Tolerate spaces around the commas ("a, b") instead of baking them into keys
    field_names = (
        [name.strip() for name in args.field_names.split(',')]
        if args.field_names
        else None
    )

    csv_to_json(args.input, args.output, delimiter, has_header, args.encoding, field_names)


if __name__ == '__main__':
    main()
# Combine the ISO 639-2 and ISO 639-3 tables into one document with a list of
# entries per code set.
#
# Expected invocation: `jq -n --slurpfile iso2 <639-2.json> --slurpfile iso3 <639-3.json>`
#   $iso2[0]: array of objects with part2b, part2t, part1, name, name_french
#   $iso3[0]: array of objects with Id, Part2b, Part2t, Part1, Scope,
#             Language_Type, Ref_Name, Comment

# Collapse "no value" to null: null, false and the empty string all count
def nullify: (. // "") | if . == "" then null else . end;

# Index an array of ISO 639-3 rows by a column, ignoring rows that leave it empty
def index_by(column): map(select((column | nullify) != null)) | INDEX(column);

# The classification of an ISO 639-3 row ({} when there is no matching row)
def classification:
  {
    scope: (.Scope | nullify),
    language_type: (.Language_Type | nullify)
  };

# Build lookup tables from Set 3 (ISO-639-3) for enrichment
($iso3[0] | index_by(.Part1)) as $lookup_by_part1 |
($iso3[0] | index_by(.Part2b)) as $lookup_by_part2b |
($iso3[0] | index_by(.Part2t)) as $lookup_by_part2t |

# Process ISO-639-2 data to extract Sets 1, 2, and 5.
# A row with fewer than five columns reaches this script with a null
# name_french (and possibly a null name, which ascii_downcase rejects).
# Such rows are skipped, as the generator this script replaces did.
($iso2[0] | map(
  select(.name_french != null)
  | {
      is_family: (.name | ascii_downcase | (contains("languages") or contains("language family"))),
      part1: (.part1 | nullify),
      part2b: (.part2b | nullify),
      part2t: (.part2t | nullify),
      name: .name,
      name_french: (.name_french | nullify)
    }
)) as $processed_iso2 |

# Set 5: Language families (identified by name patterns)
($processed_iso2 | map(
  select(.is_family and .part2b != null)
  | {code: .part2b, name: .name, name_french: .name_french}
)) as $set_5 |

# Set 1: 2-letter codes (part1)
($processed_iso2 | map(
  select((.is_family | not) and .part1 != null)
  | {
      code: .part1,
      name: .name,
      name_french: .name_french,
      part2b: .part2b,
      part2t: .part2t
    }
    + ($lookup_by_part1[.part1] // {} | classification)
)) as $set_1 |

# Set 2 bibliographic: 3-letter bibliographic codes (part2b)
($processed_iso2 | map(
  select((.is_family | not) and .part2b != null)
  | {
      code: .part2b,
      name: .name,
      name_french: .name_french,
      part1: .part1
    }
    + ($lookup_by_part2b[.part2b] // {} | classification)
)) as $set_2_bibliographic |

# Set 2 terminologic: 3-letter terminologic codes (part2t)
($processed_iso2 | map(
  select((.is_family | not) and .part2t != null)
  | {
      code: .part2t,
      name: .name,
      name_french: .name_french,
      part1: .part1
    }
    + ($lookup_by_part2t[.part2t] // {} | classification)
)) as $set_2_terminologic |

# Set 3: All ISO-639-3 codes
($iso3[0] | map(
  select((.Id | nullify) != null)
  | {code: .Id, name: .Ref_Name}
    + classification
    + {
      part2b: (.Part2b | nullify),
      part2t: (.Part2t | nullify),
      part1: (.Part1 | nullify),
      comment: (.Comment | nullify)
    }
)) as $set_3 |

# Output combined structure
{
  set_1: $set_1,
  set_2_bibliographic: $set_2_bibliographic,
  set_2_terminologic: $set_2_terminologic,
  set_3: $set_3,
  set_5: $set_5
}
# Render the ISO 639-1 (Set 1) schema from the `set_1` array of the enriched
# language document. Each entry carries code and name, plus optional
# name_french, part2b, part2t, scope and language_type.

# Expand an ISO 639-3 scope letter. An unrecognised letter aborts the run
# instead of leaking a raw code into the generated schema.
def expand_scope:
  if . == "I" then "individual"
  elif . == "M" then "macrolanguage"
  elif . == "S" then "special"
  else error("Unknown scope code '\(.)'") end;

# Expand an ISO 639-3 language type letter. An unrecognised letter aborts
# the run for the same reason.
def expand_language_type:
  if . == "L" then "living"
  elif . == "E" then "extinct"
  elif . == "H" then "historic"
  elif . == "C" then "constructed"
  elif . == "S" then "special"
  elif . == "A" then "ancient"
  else error("Unknown language type code '\(.)'") end;

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "title": "ISO 639-1:2023 Language Code",
  "description": "A two-letter language code from ISO 639-1",
  "$comment": "Set 1 contains the most common languages (2-letter codes). All Set 1 codes have corresponding codes in Set 2 and Set 3",
  "examples": (.set_1 | sort_by(.code) | .[0:4] | map(.code)),
  "x-license": "https://github.com/sourcemeta/std/blob/main/LICENSE",
  "x-links": ["https://www.iso.org/standard/74575.html"],
  "anyOf": (
    .set_1
    | sort_by(.code)
    | map({
      "title": .name
    }
    # Optional keywords are only emitted when the entry has a value for them
    + (if .scope then {"x-scope": (.scope | expand_scope)} else {} end)
    + (if .language_type then {"x-language-type": (.language_type | expand_language_type)} else {} end)
    + (if .name_french then {"x-name-french": .name_french} else {} end)
    + (if .part2b then {"x-set-2-bibliographic": .part2b} else {} end)
    + (if .part2t then {"x-set-2-terminologic": .part2t} else {} end)
    + {
      "const": .code
    })
  )
}
# Render the ISO 639-2 bibliographic (Set 2) schema from the
# `set_2_bibliographic` array of the enriched language document. Each entry
# carries code and name, plus optional name_french, part1, scope and
# language_type.

# Expand an ISO 639-3 scope letter. An unrecognised letter aborts the run
# instead of leaking a raw code into the generated schema.
def expand_scope:
  if . == "I" then "individual"
  elif . == "M" then "macrolanguage"
  elif . == "S" then "special"
  else error("Unknown scope code '\(.)'") end;

# Expand an ISO 639-3 language type letter. An unrecognised letter aborts
# the run for the same reason.
def expand_language_type:
  if . == "L" then "living"
  elif . == "E" then "extinct"
  elif . == "H" then "historic"
  elif . == "C" then "constructed"
  elif . == "S" then "special"
  elif . == "A" then "ancient"
  else error("Unknown language type code '\(.)'") end;

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "title": "ISO 639-2:2023 Bibliographic Language Code",
  "description": "A three-letter bibliographic language code from ISO 639-2",
  "$comment": "Set 2 bibliographic is a superset of Set 1 and a subset of Set 3. Bibliographic codes are based on English language names",
  "examples": (.set_2_bibliographic | sort_by(.code) | .[0:4] | map(.code)),
  "x-license": "https://github.com/sourcemeta/std/blob/main/LICENSE",
  "x-links": ["https://www.iso.org/standard/74575.html"],
  "anyOf": (
    .set_2_bibliographic
    | sort_by(.code)
    | map({
      "title": .name
    }
    # Optional keywords are only emitted when the entry has a value for them
    + (if .scope then {"x-scope": (.scope | expand_scope)} else {} end)
    + (if .language_type then {"x-language-type": (.language_type | expand_language_type)} else {} end)
    + (if .name_french then {"x-name-french": .name_french} else {} end)
    + (if .part1 then {"x-set-1": .part1} else {} end)
    + {
      "const": .code
    })
  )
}
# Render the ISO 639-2 terminologic (Set 2) schema from the
# `set_2_terminologic` array of the enriched language document. Each entry
# carries code and name, plus optional name_french, part1, scope and
# language_type.

# Expand an ISO 639-3 scope letter. An unrecognised letter aborts the run
# instead of leaking a raw code into the generated schema.
def expand_scope:
  if . == "I" then "individual"
  elif . == "M" then "macrolanguage"
  elif . == "S" then "special"
  else error("Unknown scope code '\(.)'") end;

# Expand an ISO 639-3 language type letter. An unrecognised letter aborts
# the run for the same reason.
def expand_language_type:
  if . == "L" then "living"
  elif . == "E" then "extinct"
  elif . == "H" then "historic"
  elif . == "C" then "constructed"
  elif . == "S" then "special"
  elif . == "A" then "ancient"
  else error("Unknown language type code '\(.)'") end;

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "title": "ISO 639-2:2023 Terminologic Language Code",
  "description": "A three-letter terminologic language code from ISO 639-2",
  "$comment": "Set 2 terminologic is a superset of Set 1 and a subset of Set 3. Terminologic codes are based on native language names. Only about 20 languages have both bibliographic and terminologic codes",
  "examples": (.set_2_terminologic | sort_by(.code) | .[0:4] | map(.code)),
  "x-license": "https://github.com/sourcemeta/std/blob/main/LICENSE",
  "x-links": ["https://www.iso.org/standard/74575.html"],
  "anyOf": (
    .set_2_terminologic
    | sort_by(.code)
    | map({
      "title": .name
    }
    # Optional keywords are only emitted when the entry has a value for them
    + (if .scope then {"x-scope": (.scope | expand_scope)} else {} end)
    + (if .language_type then {"x-language-type": (.language_type | expand_language_type)} else {} end)
    + (if .name_french then {"x-name-french": .name_french} else {} end)
    + (if .part1 then {"x-set-1": .part1} else {} end)
    + {
      "const": .code
    })
  )
}
# Render the ISO 639-3 (Set 3) schema from the `set_3` array of the enriched
# language document. Each entry carries code and name, plus optional scope,
# language_type, part2b, part2t, part1 and comment.

# Expand an ISO 639-3 scope letter. An unrecognised letter aborts the run
# instead of leaking a raw code into the generated schema.
def expand_scope:
  if . == "I" then "individual"
  elif . == "M" then "macrolanguage"
  elif . == "S" then "special"
  else error("Unknown scope code '\(.)'") end;

# Expand an ISO 639-3 language type letter. An unrecognised letter aborts
# the run for the same reason.
def expand_language_type:
  if . == "L" then "living"
  elif . == "E" then "extinct"
  elif . == "H" then "historic"
  elif . == "C" then "constructed"
  elif . == "S" then "special"
  elif . == "A" then "ancient"
  else error("Unknown language type code '\(.)'") end;

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "title": "ISO 639-3:2023 Language Code",
  "description": "A three-letter language code from ISO 639-3",
  "$comment": "Set 3 is a superset of Set 1 and Set 2. It provides comprehensive coverage of individual languages, macrolanguages, and special codes",
  "examples": (.set_3 | sort_by(.code) | .[0:4] | map(.code)),
  "x-license": "https://github.com/sourcemeta/std/blob/main/LICENSE",
  "x-links": ["https://www.iso.org/standard/74575.html"],
  "anyOf": (
    .set_3
    | sort_by(.code)
    | map({
      "title": .name
    }
    # Optional keywords are only emitted when the entry has a value for them
    + (if .scope then {"x-scope": (.scope | expand_scope)} else {} end)
    + (if .language_type then {"x-language-type": (.language_type | expand_language_type)} else {} end)
    + (if .part2b then {"x-set-2-bibliographic": .part2b} else {} end)
    + (if .part2t then {"x-set-2-terminologic": .part2t} else {} end)
    + (if .part1 then {"x-set-1": .part1} else {} end)
    + (if .comment then {"$comment": .comment} else {} end)
    + {
      "const": .code
    })
  )
}
# Render the ISO 639-5 (Set 5) schema from the `set_5` array of the enriched
# language document. Each entry carries code and name, plus an optional
# name_french.

# Sort once; both the examples and the branches use the same ordering
(.set_5 | sort_by(.code)) as $families
| {
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "title": "ISO 639-5:2023 Language Family Code",
  "description": "A three-letter code for language families and groups from ISO 639-5",
  "$comment": "Set 5 codes language families and groups, not individual languages. It is independent from Sets 1-3",
  "examples": ($families[0:4] | map(.code)),
  "x-license": "https://github.com/sourcemeta/std/blob/main/LICENSE",
  "x-links": ["https://www.iso.org/standard/74575.html"],
  "anyOf": [
    $families[]
    | {"title": .name}
      # The French name is only emitted when the entry has one
      + (if .name_french then {"x-name-french": .name_french} else {} end)
      + {"const": .code}
  ]
}