/*
  1st- and 2nd-order Hidden Markov model for part-of-speech tagging.
  This program differs from http://cplint.lamping.unife.it/example/inference/hmmpos.pl because
  1. a 1st-order HMM and a 2nd-order HMM are included
  2. the probabilistic predicates trans/3, trans2/4, out/3 and out2/4 are defined
  intensionally
  3. the probability values are defined on the basis of frequency data from a
  (toy, in this example) dataset
  The states represent parts-of-speech, and the symbols emitted by the states are words.
  In the 1st-order HMM, a word depends probabilistically on its own
  part-of-speech (i.e. its tag), which in turn depends on the part-of-speech of
  the preceding word (or on the start state in case there is no preceding word).
  In the 2nd-order HMM, a word depends probabilistically on its own
  part-of-speech and the preceding tag, which in turn depend on the
  parts-of-speech of the two preceding words (or on the start state in case
  there are no preceding words).
  From
  http://stp.lingfil.uu.se/~nivre/docs/thesis3.pdf
  Original program by Joakim Nivre and Torbjorn Lager, adapted to MCINTYRE by Fabrizio Riguzzi
*/

/** <examples>
?- mc_sample_arg(hmm(S,['I',can,can,a,can]),1000,S,O).
% sample the state sequence corresponding to the phrase "I can can a can";
% the most frequent state sequence is an approximate POS tagging for the
% sentence. It corresponds to the Viterbi path of the HMM.
% expected result: the most frequent tagging should be [pn,vb,vb,dt,nn]
?- mc_sample_arg(hmm2(S,['I',can,can,a,can]),1000,S,O).
% as above but for the second order model
% ?- mc_sample_arg(hmm(S,['I',can,can,a,can]),1000,S,O),argbar(O,C).
?- mc_sample_arg(hmm2(S,['I',can,can,a,can]),1000,S,O),argbar(O,C).
?- mc_sample_arg(hmm(S,[can,the,can,do,the,can,can]),10000,S,O),argbar(O,C).
% example by Douglas R. Miles
*/

:- use_module(library(mcintyre)).

:- if(current_predicate(use_rendering/1)).
:- use_rendering(c3).
:- use_rendering(graphviz).
:- endif.

:- mc.

:- begin_lpad.
% hmm(O): O is the output sequence
% hmm(S,O): O is the output sequence and S is the sequence of states
% hmm(Q,S0,S,O): from state Q, with S0 the (reversed) list of states visited
% so far, generates output O and sequence of states S

hmm(O):-hmm(_,O).
% O is an output sequence if there is a state sequence S such that hmm(S,O)
% holds

hmm(S,O):-trans(start,Q0,[]),hmm(Q0,[],S0,O),reverse(S0,S).
% O is an output sequence and S a state sequence if the chain starts at state
% start and ends generating state sequence S and output sequence O

hmm(Q,S0,S,[L|O]):-
  trans(Q,Q1,S0),
  out(Q,L,S0),
  hmm(Q1,[Q|S0],S,O).
% an HMM in state Q moves to state Q1, emits the word L
% and continues the chain

hmm(_,S,S,[]).
% an HMM in any state terminates the sequence without emitting any symbol

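% Illustrative trace (a sketch, not part of the model): for
% hmm(S,[the,can]) the program samples Q0 from trans(start,Q0,[]), then
% Q1 from trans(Q0,Q1,[]) together with out(Q0,the,[]), then a discarded
% successor from trans(Q1,_,[Q0]) together with out(Q1,can,[Q0]); the
% base clause then closes the chain with the accumulator [Q1,Q0], which
% reverse/2 turns into S=[Q0,Q1], e.g. S=[dt,nn].
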
% hmm2(O): O is the output sequence
% hmm2(S,O): O is the output sequence and S is the sequence of states
% hmm2(Q0,Q,S0,S,O): from previous state Q0 and current state Q, with S0 the
% (reversed) list of states visited so far, generates output O and sequence
% of states S

hmm2(O):-hmm2(_,O).
% O is an output sequence if there is a state sequence S such that hmm2(S,O)
% holds

hmm2(S,O):-trans2(start,start,Q0,[]),hmm2(start,Q0,[],S0,O),reverse(S0,S).
% O is an output sequence and S a state sequence if the chain starts at state
% start and ends generating state sequence S and output sequence O

hmm2(Q0,Q,S0,S,[L|O]):-
  trans2(Q0,Q,Q1,S0),
  out2(Q0,Q,L,S0),
  hmm2(Q,Q1,[Q|S0],S,O).
% an HMM with previous state Q0 and current state Q moves to state Q1,
% emits the word L and continues the chain

hmm2(_,_,S,S,[]).
% an HMM in any state terminates the sequence without emitting any symbol

% trans(S0,S1,H): sample the next state S1 given the current state S0.
% H, the list of states generated so far, identifies the time step, so
% that the choices made at different steps are independent random
% variables.
trans(S0,S1,H):-
  findall((S,P),pc_c(S,S0,P),L),
  append(L0,[(LastS,_P)],L),
  foldl(pick_next_state(S0,H),L0,(1,_),(_,S1)),
  (var(S1)->
    S1=LastS
  ;
    true
  ).

% the accumulator (P0,V0) carries the residual probability mass P0 and the
% state V0 picked so far (unbound until a choice succeeds)
pick_next_state(_S0,_H,_L,(P0,V0),(P0,V0)):-
  nonvar(V0).

pick_next_state(S0,H,(S,P),(P0,V0),(P1,V1)):-
  var(V0),
  PF is P/P0,
  (prob_fact_state(S0,S,H,PF)->
    P1=PF,
    V1=S
  ;
    P1 is P0*(1-PF),
    V1=V0
  ).

% probabilistic fact with a variable probability: succeeds with probability P
prob_fact_state(_,_,_,P):P.

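% How the renormalisation works (illustrative numbers, not from the
% dataset): if pc_c yields [(a,0.5),(b,0.3),(c,0.2)], a is picked with
% probability 0.5/1 = 0.5; on failure the residual mass is 0.5, so b is
% picked with probability 0.3/0.5 = 0.6, i.e. 0.5*0.6 = 0.3 overall; if
% that also fails, the last element c is taken deterministically, with
% overall probability 0.5*0.4 = 0.2. A categorical distribution is thus
% encoded as a chain of binary probabilistic facts.
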
% out(S0,W,H): sample the word W emitted in state S0, using the same
% renormalisation scheme as trans/3
out(S0,W,H):-
  findall((W,P),pw_c(W,S0,P),L),
  append(L0,[(LastW,_P)],L),
  foldl(pick_word(S0,H),L0,(1,_),(_,W)),
  (var(W)->
    W=LastW
  ;
    true
  ).

pick_word(_S0,_H,_L,(P0,V0),(P0,V0)):-
  nonvar(V0).

pick_word(S0,H,(W,P),(P0,V0),(P1,V1)):-
  var(V0),
  PF is P/P0,
  (prob_fact_word(S0,W,H,PF)->
    P1=PF,
    V1=W
  ;
    P1 is P0*(1-PF),
    V1=V0
  ).

prob_fact_word(_,_,_,P):P.

% trans2(S0,S1,S2,H): sample the next state S2 given the two previous
% states S0 and S1
trans2(S0,S1,S2,H):-
  findall((S,P),pc_cc(S,S0,S1,P),L),
  append(L0,[(LastS,_P)],L),
  foldl(pick_next_state2(S0,S1,H),L0,(1,_),(_,S2)),
  (var(S2)->
    S2=LastS
  ;
    true
  ).

pick_next_state2(_S0,_S1,_H,_L,(P0,V0),(P0,V0)):-
  nonvar(V0).

pick_next_state2(S0,S1,H,(S,P),(P0,V0),(P1,V1)):-
  var(V0),
  PF is P/P0,
  (prob_fact_state2(S0,S1,S,H,PF)->
    P1=PF,
    V1=S
  ;
    P1 is P0*(1-PF),
    V1=V0
  ).

prob_fact_state2(_,_,_,_,P):P.

% out2(S0,S1,W,H): sample the word W emitted with previous state S0 and
% current state S1
out2(S0,S1,W,H):-
  findall((W,P),pw_cc(W,S0,S1,P),L),
  append(L0,[(LastW,_P)],L),
  foldl(pick_word2(S0,S1,H),L0,(1,_),(_,W)),
  (var(W)->
    W=LastW
  ;
    true
  ).

pick_word2(_S0,_S1,_H,_L,(P0,V0),(P0,V0)):-
  nonvar(V0).

pick_word2(S0,S1,H,(W,P),(P0,V0),(P1,V1)):-
  var(V0),
  PF is P/P0,
  (prob_fact_word2(S0,S1,W,H,PF)->
    P1=PF,
    V1=W
  ;
    P1 is P0*(1-PF),
    V1=V0
  ).

prob_fact_word2(_,_,_,_,P):P.

:- end_lpad.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% con.mle.pl
%
% MLE CONTEXTUAL MODEL
%
% Maximum likelihood estimation of contextual probabilities
% P(c), P(c2|c1), P(c3|c1,c2).
%
% The definitions presuppose that the following
% frequencies are stored in the internal database:
%
% fc(C,F) - Frequency of class C
% fcc(C1,C2,F) - Joint frequency of classes C1 and C2
% fccc(C1,C2,C3,F) - Joint frequency of classes C1, C2 and C3
%
% NB: in the original program fc/2, fcc/3 and fccc/4 had to be defined
% also for the dummy class start used to initialize the tagger; in this
% adaptation the start cases are instead handled by the dedicated
% pc_c/3 and pc_cc/4 clauses below.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% P(c)
pc(C,P) :-
  fc(C,F),
  tokens(N),
  P is F/N.

% P(c2|c1)
pc_c(C,start,P):-!,
  fc(C,F1),
  tokens(F2),
  P is F1/F2.

pc_c(C2,C1,P) :-
  fcc(C1,C2,F1),
  fc(C1,F2),
  P is F1/F2.

pc_c(C2,C1,0) :-
  \+ fcc(C1,C2,_).

pc_cc(C,start,start,P):-!,
  pc_c(C,start,P).

pc_cc(C,start,C1,P):-!,
  pc_c(C,C1,P).

% P(c3|c1,c2)
pc_cc(C3,C1,C2,P) :-
  fccc(C1,C2,C3,F1),
  fcc(C1,C2,F2),
  P is F1/F2.

pc_cc(C3,C1,C2,0) :-
  \+ fccc(C1,C2,C3,_).

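% Worked example on the frequency data below (illustrative): pc(vb,P)
% gives P = 7/31 since fc(vb,7) and tokens(31); pc_c(vb,nn,P) gives
% P = fcc(nn,vb,1)/fc(nn,6) = 1/6; pc_cc(cn,dt,nn,P) gives
% P = fccc(dt,nn,cn,1)/fcc(dt,nn,5) = 1/5.
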
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% lex.mle.pl
%
% MLE LEXICAL MODEL
%
% Maximum likelihood estimation of lexical probabilities P(w|c).
% The definition presupposes that the following
% frequencies are stored in the internal database:
%
% fwc(W,C,F) - Joint frequency of word W and class C
% fc(C,F) - Frequency of class C
%
% It also presupposes that the set of open classes (i.e. classes
% allowed for unknown words) is defined by means of clauses of
% the form:
%
% open(C)
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
/*
pw_c(W,C,P) :-
  fwc(W,C,F1),
  fc(C,F2),
  P is F1/F2.

pw_c(W,C,0) :-
  \+ fwc(W,_,_),
  open(C).
*/
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% lex.uni.pl
%
% UNIFORM SMOOTHING OF LEXICAL MODEL
%
% For known words, the lexical probability is a standard
% maximum likelihood estimate:
%
% P(w|c) = f(w,c) / f(c)
%
% except that f(c) is adjusted by adding f(c)/n for open
% word classes.
%
% For unknown words, the lexical probability is 1/n
% for all open classes, where n is the number of tokens
% in the training corpus.
%
% The definition presupposes that the following
% frequencies are stored in the internal database:
%
% fwc(W,C,F) - Joint frequency of word W and class C
% fc(C,F) - Frequency of class C
%
% It also presupposes that the set of open classes
% (i.e. those allowed for unknown words) is defined by
% clauses of the following form:
%
% open(C)
%
% Finally, it presupposes that the total number of tokens
% in the training corpus is defined:
%
% tokens(N)
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
pw_c(W,C,P) :-
  fwc(W,C,F1),
  fc(C,F2),
  \+ open(C),
  P is F1/F2.

pw_c(W,C,P) :-
  fwc(W,C,F1),
  fc(C,F2),
  open(C),
  tokens(N),
  P is F1/(F2+(F2/N)).

pw_c(W,C,P) :-
  \+ fwc(W,_,_),
  open(C),
  tokens(N),
  P is 1/N.

% P(w|c1,c2): lexical probability conditioned on the current class C2 and
% the preceding class C1; falls back on pw_c/3 when the joint frequency of
% W, C1 and C2 does not exceed the threshold hfreq(F)
pw_cc(W,C1,C2,P) :-
  fwcc(W,C1,C2,F1),
  hfreq(F),
  F1 > F,
  fcc(C1,C2,F2),
  P is F1/F2.

pw_cc(W,C1,C2,P) :-
  pw_c(W,C2,P),
%  fc(C1,_),
  \+ (fwcc(W,C1,C2,F1), hfreq(F), F1 > F).

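% Worked example on the data below (illustrative): pw_c(can,vb,P) gives
% P = 3/(7+7/31), about 0.415, since fwc(can,vb,3), fc(vb,7), open(vb)
% and tokens(31); pw_c(the,dt,P) gives the plain MLE 3/5 because dt is
% not an open class; for an unknown word W, pw_c(W,C,P) gives P = 1/31
% for every open class C.
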
% open classes, i.e. classes allowed for unknown words (some of them do
% not occur in the toy corpus below)
open(ab).
open(in).
open(jj).
open(nn).
open(pc).
open(p).
open(rg).
open(uo).
open(vb).

hfreq(0).
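% With hfreq(0), any lexical trigram attested in the data (frequency >= 1)
% is used directly by pw_cc/4; raising the threshold makes the model back
% off to pw_c/3 more often.
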
/* Corpus
I/pn can/vb light/vb a/dt fire/nn and/cn you/pn
can/vb open/vb a/dt can/nn of/pp beans/nn ./dl
Now/ab the/dt can/nn is/vb open/jj ,/dl and/cn
we/pn can/vb eat/vb in/pp the/dt light/nn of/pp
the/dt fire/nn ./dl

Frequency Databases
*/
fwc('.', dl, 2).
fwc(',', dl, 1).
fwc(a, dt, 2).
fwc(and, cn, 2).
fwc(beans, nn, 1).
fwc(can, nn, 2).
fwc(can, vb, 3).
fwc(eat, vb, 1).
fwc(fire, nn, 2).
fwc('I', pn, 1).
fwc(in, pp, 1).
fwc(is, vb, 1).
fwc(light, nn, 1).
fwc(light, vb, 1).
fwc('Now', ab, 1).
fwc(of, pp, 2).
fwc(open, vb, 1).
fwc(open, jj, 1).
fwc(the, dt, 3).
fwc(we, pn, 1).
fwc(you, pn, 1).

% fwcc(W,C1,C2,F) - joint frequency of word W with previous class C1 and
% current class C2
fwcc('.', nn, dl, 2).
fwcc(',', jj, dl, 1).
fwcc(a, vb, dt, 2).
fwcc(and, nn, cn, 1).
fwcc(and, dl, cn, 1).
fwcc(beans, pp, nn, 1).
fwcc(can, dt, nn, 2).
fwcc(can, pn, vb, 3).
fwcc(eat, vb, vb, 1).
fwcc(fire, dt, nn, 2).
%fwcc('I', start, pn, 1).
fwcc(in, vb, pp, 1).
fwcc(is, nn, vb, 1).
fwcc(light, dt, nn, 1).
fwcc(light, vb, vb, 1).
fwcc('Now', dl, ab, 1).
fwcc(of, nn, pp, 2).
fwcc(open, vb, vb, 1).
fwcc(open, vb, jj, 1).
fwcc(the, ab, dt, 1).
fwcc(the, pp, dt, 2).
fwcc(we, cn, pn, 1).
fwcc(you, cn, pn, 1).

%fc(start,F):-
%  tokens(F).

fc(ab, 1).
fc(cn, 2).
fc(dl, 3).
fc(dt, 5).
fc(jj, 1).
fc(nn, 6).
fc(pn, 3).
fc(pp, 3).
fc(vb, 7).

%fcc(start,C1,F):-
%  fc(C1,F).

fcc(ab, dt, 1).
fcc(cn, pn, 2).
fcc(dl, ab, 1).
fcc(dl, cn, 1).
fcc(dt, nn, 5).
fcc(jj, dl, 1).
fcc(nn, cn, 1).
fcc(nn, dl, 2).
fcc(nn, pp, 2).
fcc(nn, vb, 1).
fcc(pn, vb, 3).
fcc(pp, dt, 2).
fcc(pp, nn, 1).
fcc(vb, dt, 2).
fcc(vb, jj, 1).
fcc(vb, pp, 1).
fcc(vb, vb, 3).

fccc(ab, dt, nn, 1).
fccc(cn, pn, vb, 2).
fccc(dl, ab, dt, 1).
fccc(dl, cn, pn, 1).
fccc(dt, nn, cn, 1).
fccc(dt, nn, dl, 1).
fccc(dt, nn, pp, 2).
fccc(dt, nn, vb, 1).
fccc(jj, dl, cn, 1).
fccc(nn, cn, pn, 1).
fccc(nn, dl, ab, 1).
fccc(nn, pp, dt, 1).
fccc(nn, pp, nn, 1).
fccc(nn, vb, jj, 1).
fccc(pn, vb, vb, 3).
fccc(pp, nn, dl, 1).
fccc(pp, dt, nn, 2).
fccc(vb, dt, nn, 2).
fccc(vb, jj, dl, 1).
fccc(vb, pp, dt, 1).
fccc(vb, vb, dt, 2).
fccc(vb, vb, pp, 1).

% cache corpus statistics derived from the frequency tables: the number
% of distinct classes, the total number of tokens and the number of
% distinct word types
classes :-
  setof(C,F^fc(C,F),Cs),
  length(Cs,N),
  assert(classes(N)).

tokens :-
  bagof(F,W^C^fwc(W,C,F),Fs),
  sum_list(Fs,N),
  assert(tokens(N)).

types :-
  setof(W,C^F^fwc(W,C,F),Ws),
  length(Ws,N),
  assert(types(N)).

:- tokens.
:- types.
:- classes.
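% For the toy corpus above these directives assert tokens(31), types(18)
% and classes(9).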