1:- module(pinyin, [
    2  word//2,
    3  num_dia/2]).

Parsing and generation of Hanyu Pinyin

This module implements a grammar that parses and generates words written in Hanyu Pinyin, the standard romanization system for Mandarin Chinese. It also provides a utility to convert whole texts between diacritics and numbers for writing tones.

   14:- use_module(casing, [
   15    casing/3]).   16
   17:- set_prolog_flag(double_quotes, codes). % SWI-7 ready
   18
   19% TODO support v for ü?
   20% TODO support tone number 0/5?
   21% TODO What about "Harbin"?

num_dia(?Num, ?Dia)

Converts between Pinyin with diacritics and numbers as tone marks. Num and Dia are code lists.

Maximal substrings of Pinyin letters, the numbers 1-4 as well as the characters ' and - are converted if they can be parsed as lower-case, capitalized or all-caps Pinyin words in the input format, everything else is left alone. Case is preserved.

The assumed input format is diacritics if Num is variable, numbers otherwise.

Example usage:

?- set_prolog_flag(double_quotes, codes).
true.

?- num_dia("Wo3 xian4zai4 dui4 jing1ju4 hen3 gan3 xing4qu4.",
|    Codes), atom_codes(Atom, Codes).
Codes = [87, 466, 32, 120, 105, 224, 110, 122, 224|...],
Atom = 'Wǒ xiànzài duì jīngjù hěn gǎn xìngqù.'.

?- num_dia(Codes, "Nǐ ne?"), atom_codes(Atom, Codes).
Codes = [78, 105, 51, 32, 110, 101, 63],
Atom = 'Ni3 ne?'.

   51num_dia(Num, Dia) :-
   52  var(Num),
   53  !,
   54  convert(dia, num, Dia, Num).
   55num_dia(Num, Dia) :-
   56  convert(num, dia, Num, Dia).
   57
   58% PINYIN WORD GRAMMAR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

word(?Morphs, ?ND)//

A DCG that parses or generates a single word written in Hanyu Pinyin.

Morphs is a list of "morphs" (not in the strictest linguistic sense) that make up the word. They take one of three forms:

Initial-Final-Tone where Initial and Final are atoms. The final takes the "underlying" form, which may be different from the written form. Tone is one of the integers from 0 to 4.
r (for the erhuayin suffix)
- (for word-internal hyphens)

ND is either num or dia, depending on how tones are represented (numbers or diacritics).

The grammar implements the following tricky aspects of Hanyu Pinyin:

placement of w/y before i/u/ü as required
abbreviation of triphthongs into two vowel letters as required
change of ü to u after j/q/x/y
placement of tone marks
placement of apostrophes between syllables as required
the possibility of word-internal hyphens
the possibility of erhuayin, placing r at the end of a word

The following aspects are currently not supported:

Case. This DCG only handles lower-case letters.
The spelling variant where ü is written as v.
The spelling variant where the neutral tone is written as 0 or 5.
Phonological restrictions. Not every initial, final and tone combine into a syllable in Mandarin. This grammar, however, will happily parse and generate any combination. It only enforces that the erhuayin suffix not appear after the e final, and that j/x/q not be followed by an u sound, because either would make the grammar ambiguous.
Dropping apostrophes when using numbers instead of tone marks.

Example usage:

?- set_prolog_flag(double_quotes, codes).
true.

?- phrase(word([n-ü-3, ''-er-2], ND), Codes), atom_codes(Atom, Codes).
ND = dia,
Codes = [110, 474, 39, 233, 114],
Atom = 'nǚ\'ér' ;
ND = num,
Codes = [110, 252, 51, 39, 101, 114, 50],
Atom = 'nü3\'er2' ;
false.

?- phrase(word(Morphs, ND), "yìjué").
Morphs = [''-i-4, j-üe-2],
ND = dia ;
false.

  120word([Morph], ND) -->
  121  syll1(ND, Morph, _).
  122% erhuayin
  123word([Morph, r], ND) -->
  124  syll1(ND, Morph, _),
  125  { Morph \= _-e-_
  126  },
  127  "r".
  128word([Morph|Morphs], ND) -->
  129  syll1(ND, Morph, _),
  130  sylls(ND, Morphs).
  131word([''-Final-Tone], ND) -->
  132  syllabic_nasal(ND, Final, Tone).
  133
  134% initial syllable
  135syll1(ND, ''-Final-Tone, RequiresApostrophe) -->
  136  bare_final(ND, Final, Tone, RequiresApostrophe).
  137syll1(ND, Initial-Final-Tone, no) -->
  138  nonempty_initial(Initial),
  139  { requires_modification(Initial, RequiresModification)
  140  },
  141  nonbare_final(ND, RequiresModification, Final, Tone).
  142
  143% sequence of non-initial syllables
  144sylls(ND, [Morph]) -->
  145  syll(ND, Morph).
  146% erhuayin
  147sylls(ND, [Morph, r]) -->
  148  syll(ND, Morph),
  149  { Morph \= _-e-_
  150  },
  151  "r".
  152sylls(ND, [Morph|Morphs]) -->
  153  syll(ND, Morph),
  154  sylls(ND, Morphs).
  155
  156% non-initial syllable
  157syll(ND, [-|Morph]) -->
  158  "-",
  159  syll1(ND, Morph).
  160syll(ND, Initial-Final-Tone) -->
  161  syll1(ND, Initial-Final-Tone, no).
  162syll(ND, Initial-Final-Tone) -->
  163  "'",
  164  syll1(ND, Initial-Final-Tone, yes).
  165
  166% GRAPHEMIC TRANSFORMATIONS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

render_final_bare(+FinalUnderlying, -FinalSurface): Converts a final from underlying to written form for an empty initial.
Introduce y resp. w before [iuü]. Also delete [iu] if another vowel follows.

  173render_final_bare([Glide, Vowel|Rest],[Substitute, Vowel|Rest]) :-
  174  member(Glide, "iu"),
  175  vowel_letter(Vowel),
  176  !,
  177  glide_substitute([Glide], [Substitute]).
  178render_final_bare([Glide|Rest], [Substitute, ModifiedGlide|Rest]) :-
  179  [Glide] = "ü",
  180  !,
  181  [Substitute, ModifiedGlide] = "yu".
  182render_final_bare([Glide|Rest], [Substitute, Glide|Rest]) :-
  183  member(Glide, "iuü"),
  184  !,
  185  glide_substitute([Glide], [Substitute]).
  186% Otherwise, written form is identical to underlying.
  187render_final_bare(Final, Final).

render_final_nonbare(+JXQ, +FinalUnderlying, -FinalSurface): Converts a final from underlying to written form for a nonempty initial.
Three finals drop a vowel in spelling after a non-empty initial. If JXQ==yes, ü changes to u.

  195render_final_nonbare(_, "uei", "ui") :-
  196  !.
  197render_final_nonbare(_, "iou", "iu") :-
  198  !.
  199render_final_nonbare(_, "uen", "un") :-
  200  !.
  201% After j, x, q, render ü as u:
  202render_final_nonbare(yes, [Glide|Rest], [ModifiedGlide|Rest]) :-
  203  [Glide] = "ü",
  204  !,
  205  [ModifiedGlide] = "u".
  206% Disallow u after j, x, q:
  207render_final_nonbare(yes, [Glide|_], _) :-
  208  [Glide] = "u",
  209  !,
  210  fail.
  211render_final_nonbare(_, Final, Final).

tonalize(+RenderedFinal, +Tone, ?ND, -Tonalized): Adds a tone diacritic or number to a rendered final.

  215tonalize(Final, Tone, dia, Tonalized) :-
  216  add_tone_mark(Final, Tone, Tonalized).
  217tonalize(Final, 0, num, Final) :-
  218  !.
  219tonalize(Final, Tone, num, Tonalized) :-
  220  atom_codes(Tone, ToneCodes),
  221  append(Final, ToneCodes, Tonalized).
  222
  223% add tone mark to a final
  224add_tone_mark("m", Tone, [Tonal]) :-
  225  !, 
  226  tonal(Tone, "m", [Tonal]). % no vowel - place mark on m
  227add_tone_mark("n", Tone, [Tonal]) :-
  228  !, 
  229  tonal(Tone, "n", [Tonal]). % no vowel - place mark on n
  230add_tone_mark("ng", Tone, [Tonal|"g"]) :-
  231  !, 
  232  tonal(Tone, "n", [Tonal]). % no vowel - place mark on n
  233add_tone_mark([First|Rest], Tone, [Tonal|Rest]) :-
  234  member(First, "aeo"),  % these three greedily attract the tone mark
  235  !, 
  236  tonal(Tone, [First], [Tonal]).
  237add_tone_mark([First, Vowel|Rest], Tone, [First|Tonal]) :-
  238  vowel_letter(Vowel),  % otherwise,  do not yet place the mark if another vowel follows 
  239  !, 
  240  add_tone_mark([Vowel|Rest], Tone, Tonal).
  241add_tone_mark([First|Rest], Tone, [Tonal|Rest]) :-
  242  vowel_letter(First),  % last vowel - last chance to place the mark
  243  !, 
  244  tonal(Tone, [First], [Tonal]).
  245
  246% GRAPH/PHONE PROPERTIES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  247
  248% Indicates whether a final must be preceded by an apostrophe if it is bare (no
  249% initial preceding) and non-initial.
  250requires_apostrophe([First|_], yes) :-
  251  member(First, "aeo"),
  252  !.
  253requires_apostrophe(_, no).
  254
  255% Indicates whether ü changes to u after the given initial.
  256requires_modification(j, yes) :-
  257  !.
  258requires_modification(x, yes) :-
  259  !.
  260requires_modification(q, yes) :-
  261  !.
  262requires_modification(_, no).
  263
  264vowel_letter(X) :-
  265  member(X, "aeiouü").
  266
  267glide_substitute("i", "y").
  268glide_substitute("u", "w").
  269glide_substitute("ü", "y").
  270
  271% map between letters with and without tone marks
  272% TODO m only available with 0 and 2 in Unicode, are others not needed for
  273% Mandarin? Otherwise we have to use combining characters and modify
  274% add_tone_mark/3.
  275tonal(0, "a", "a").
  276tonal(0, "e", "e").
  277tonal(0, "i", "i").
  278tonal(0, "o", "o").
  279tonal(0, "u", "u").
  280tonal(0, "ü", "ü").
  281tonal(0, "m", "m").
  282tonal(0, "n", "n").
  283tonal(1, "a", "ā").
  284tonal(1, "e", "ē").
  285tonal(1, "i", "ī").
  286tonal(1, "o", "ō").
  287tonal(1, "u", "ū").
  288tonal(1, "ü", "ǖ").
  289tonal(1, "n", "ñ").
  290tonal(2, "a", "á").
  291tonal(2, "e", "é").
  292tonal(2, "i", "í").
  293tonal(2, "o", "ó").
  294tonal(2, "u", "ú").
  295tonal(2, "ü", "ǘ").
  296tonal(2, "m", "ḿ").
  297tonal(2, "n", "ń").
  298tonal(3, "a", "ǎ").
  299tonal(3, "e", "ě").
  300tonal(3, "i", "ǐ").
  301tonal(3, "o", "ǒ").
  302tonal(3, "u", "ǔ").
  303tonal(3, "ü", "ǚ").
  304tonal(3, "n", "ň").
  305tonal(4, "a", "à").
  306tonal(4, "e", "è").
  307tonal(4, "i", "ì").
  308tonal(4, "o", "ò").
  309tonal(4, "u", "ù").
  310tonal(4, "ü", "ǜ").
  311tonal(4, "n", "ǹ").
  312
  313% Inventory of tones.
  314tone(0).
  315tone(1).
  316tone(2).
  317tone(3).
  318tone(4).
  319
  320% TERM EXPANSION RULES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  321
  322% Generate terminal DCG rules for initials.
  323term_expansion(nonempty_initial(Initial),
  324    (nonempty_initial(InitialAtom) --> Initial)) :-
  325  atom_codes(InitialAtom, Initial).
  326% Generate terminal DCG rules for each final, depending on the final, whether
  327% it follows a nonempty initial ("nonbare"), which tone it carries and how
  328% tones are written (diacritic or number).
  329term_expansion(final(Final), Rules) :-
  330  findall(
  331      ( bare_final(ND, FinalAtom, Tone, RequiresApostrophe) --> String ),
  332      ( render_final_bare(Final, Rendered),
  333        requires_apostrophe(Final, RequiresApostrophe),
  334        member(ND, [dia, num]),
  335        tone(Tone),
  336        tonalize(Rendered, Tone, ND, String),
  337        atom_codes(FinalAtom, Final)
  338      ),
  339      BareRules),
  340  findall(
  341      ( nonbare_final(ND, JXQ, FinalAtom, Tone) --> String ),
  342      ( member(JXQ, [yes, no]),
  343        render_final_nonbare(JXQ, Final, Rendered),
  344        member(ND, [dia, num]),
  345        tone(Tone),
  346        tonalize(Rendered, Tone, ND, String),
  347        atom_codes(FinalAtom, Final)
  348      ),
  349      NonbareRules),
  350  append(BareRules, NonbareRules, Rules0),
  351  % This translation should happen automatically, but doesn't:
  352  maplist(dcg_translate_rule, Rules0, Rules).
  353% Generate terminal DCG rules for syllabic nasals.
  354term_expansion(syllabic_nasal(Final), Rules) :-
  355  findall(
  356      ( syllabic_nasal(ND, FinalAtom, Tone) --> String ),
  357      ( member(ND, [dia, num]),
  358        tone(Tone),
  359        tonalize(Final, Tone, ND, String),
  360        atom_codes(FinalAtom, Final)
  361      ),
  362      Rules0),
  363  maplist(dcg_translate_rule, Rules0, Rules).
  364
  365% PHONE INVENTORIES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  366% These are translated to DCG rules so are not available in this form in the
  367% compiled program.
  368
  369:- discontiguous bare_final//4.  370:- discontiguous nonbare_final//4.  371
  372% Inventory of initials.
  373nonempty_initial("b").
  374nonempty_initial("p").
  375nonempty_initial("m").
  376nonempty_initial("f").
  377nonempty_initial("d").
  378nonempty_initial("t").
  379nonempty_initial("n").
  380nonempty_initial("l").
  381nonempty_initial("g").
  382nonempty_initial("k").
  383nonempty_initial("h").
  384nonempty_initial("j").
  385nonempty_initial("q").
  386nonempty_initial("x").
  387nonempty_initial("zh").
  388nonempty_initial("ch").
  389nonempty_initial("sh").
  390nonempty_initial("r").
  391nonempty_initial("z").
  392nonempty_initial("c").
  393nonempty_initial("s").
  394
  395% Inventory of finals in canonical form. Actual written form depends on
  396% initial.
  397final("i").
  398final("u").
  399final("ü").
  400final("a").
  401final("ia").
  402final("ua").
  403final("o").
  404final("io"). % occurs in interjections
  405final("uo").
  406final("e").
  407final("ie").
  408final("üe").
  409final("er").
  410final("ai").
  411final("uai").
  412final("ei").
  413final("uei").
  414final("ao").
  415final("iao").
  416final("ou").
  417final("iou").
  418final("an").
  419final("ian").
  420final("uan").
  421final("üan").
  422final("en").
  423final("in").
  424final("uen").
  425final("ün").
  426final("ang").
  427final("iang").
  428final("uang").
  429final("eng").
  430final("ing").
  431final("ueng").
  432final("ong").
  433final("iong").
  434
  435% Inventory of syllabic nasals.
  436syllabic_nasal("m").
  437syllabic_nasal("n").
  438syllabic_nasal("ng").
  439
  440% CONVERSION BETWEEN NUMBERS AND DIACRITICS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  441
  442convert(InputFormat, OutputFormat, Input, Output) :-
  443  phrase(l(InputSegments), Input),
  444  maplist(convert_segment(InputFormat, OutputFormat), InputSegments,
  445      OutputSegments),
  446  append(OutputSegments, Output).
  447
  448convert_segment(_, _, o(C), [C]) :-
  449  !.
  450convert_segment(InputFormat, OutputFormat, w(InputWord), OutputWord) :-
  451  casing(InputWord, UncasedInputWord, CasePattern),
  452  phrase(word(Morphs, InputFormat), UncasedInputWord),
  453  phrase(word(Morphs, OutputFormat), UncasedOutputWord),
  454  casing(OutputWord, UncasedOutputWord, CasePattern),
  455  !.
  456% word does not conform to rules, we leave it alone
  457convert_segment(_, _, w(Word), Word).
  458
  459% TEXT PARSING %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  460% A mini-DCG that parses a string into a list of words and intervening non-word
  461% characters.
  462
  463% string starting with word
  464l([w(W)|L]) -->
  465  w(W),
  466  !,
  467  l(L).
  468% string starting with non-word character
  469l([o(C)|L]) -->
  470  [C],
  471  !,
  472  l(L).
  473% empty string
  474l([]) -->
  475  "".
  476
  477% word with more than one character
  478w([C|W]) -->
  479  c(C),
  480  w(W),
  481  !.
  482% word with one character
  483w([C]) -->
  484  c(C).
  485
  486% word character
  487c(C) -->
  488  [C],
  489  { member(C, "AÃÁǍÀBCDEĒÉĚÈFGHIĪÍǏÌJKLMNOŌÓǑÒPQRSTUŪÚǓÙÜǕǗǙǛWXYZaãáǎàbcdeēéěèfghiīíǐìjklmḿnñńňǹoōóǒòpqrstuūúǔùüǖǘǚǜwxyz1234'-")
  490  }