/*
  1st- and 2nd-order Hidden Markov model for part-of-speech tagging.
  This program differs from http://cplint.lamping.unife.it/example/inference/hmmpos.pl because
  1. a 1st-order HMM and a 2nd-order HMM are included
  2. the probabilistic predicates trans/3, trans2/4, out/3 and out2/4 are defined
  intensionally
  3. the probability values are defined on the basis of frequency data from a
  (toy, in this example) dataset
  The states represent parts-of-speech, and the symbols emitted by the states are words.
  In the 1st-order HMM, a word depends probabilistically on its own
  part-of-speech (i.e. its tag), which in turn depends on the part-of-speech of
  the preceding word (or on the start state in case there is no preceding word).
  In the 2nd-order HMM, a word depends probabilistically on its own
  part-of-speech and the preceding tag, which in turn depend on the
  parts-of-speech of the two preceding words (or on the start state in case
  there are no preceding words).
  From
  http://stp.lingfil.uu.se/~nivre/docs/thesis3.pdf
  Original program by Joakim Nivre and Torbjorn Lager, adapted to MCINTYRE by Fabrizio Riguzzi
*/

/** <examples>
?- mc_sample_arg(hmm(S,['I',can,can,a,can]),1000,S,O).
% sample the state sequence corresponding to the phrase "I can can a can";
% the most frequent state sequence is an approximate POS tagging for the
% sentence. It corresponds to the Viterbi path of the HMM.
% expected result: the most frequent tagging should be [pn,vb,vb,dt,nn]
?- mc_sample_arg(hmm2(S,['I',can,can,a,can]),1000,S,O).
% as above but for the second order model
% ?- mc_sample_arg(hmm(S,['I',can,can,a,can]),1000,S,O),argbar(O,C).
?- mc_sample_arg(hmm2(S,['I',can,can,a,can]),1000,S,O),argbar(O,C).
?- mc_sample_arg(hmm(S,[can,the,can,do,the,can,can]),10000,S,O),argbar(O,C).
% example by Douglas R. Miles
*/

:- use_module(library(mcintyre)).

:- if(current_predicate(use_rendering/1)).
:- use_rendering(c3).
:- use_rendering(graphviz).
:- endif.

:- mc.

:- begin_lpad.
% hmm(O): O is the output sequence
% hmm(S,O): O is the output sequence and S is the sequence of states
% hmm(Q,S0,S,O): from state Q, with S0 the (reversed) list of states visited
% so far, generates output O and sequence of states S

hmm(O):-hmm(_,O).
% O is an output sequence if there is a state sequence S such that hmm(S,O)
% holds

hmm(S,O):-trans(start,Q0,[]),hmm(Q0,[],S0,O),reverse(S0,S).
% O is an output sequence and S a state sequence if the chain starts at state
% start and ends generating state sequence S and output sequence O

hmm(Q,S0,S,[L|O]):-
  trans(Q,Q1,S0),
  out(Q,L,S0),
  hmm(Q1,[Q|S0],S,O).
% an HMM in state Q moves to state Q1, emits the word L
% and continues the chain

hmm(_,S,S,[]).
% an HMM in any state terminates the sequence without emitting any symbol

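% Illustrative trace (a sketch, not part of the model): for
% hmm(S,[the,can]) the program samples Q0 from trans(start,Q0,[]), then
% Q1 from trans(Q0,Q1,[]) together with out(Q0,the,[]), then a discarded
% successor from trans(Q1,_,[Q0]) together with out(Q1,can,[Q0]); the
% base clause then closes the chain with the accumulator [Q1,Q0], which
% reverse/2 turns into S=[Q0,Q1], e.g. S=[dt,nn].
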
% hmm2(O): O is the output sequence
% hmm2(S,O): O is the output sequence and S is the sequence of states
% hmm2(Q0,Q,S0,S,O): from previous state Q0 and current state Q, with S0 the
% (reversed) list of states visited so far, generates output O and sequence
% of states S

hmm2(O):-hmm2(_,O).
% O is an output sequence if there is a state sequence S such that hmm2(S,O)
% holds

hmm2(S,O):-trans2(start,start,Q0,[]),hmm2(start,Q0,[],S0,O),reverse(S0,S).
% O is an output sequence and S a state sequence if the chain starts at state
% start and ends generating state sequence S and output sequence O

hmm2(Q0,Q,S0,S,[L|O]):-
  trans2(Q0,Q,Q1,S0),
  out2(Q0,Q,L,S0),
  hmm2(Q,Q1,[Q|S0],S,O).
% an HMM with previous state Q0 and current state Q moves to state Q1,
% emits the word L and continues the chain

hmm2(_,_,S,S,[]).
% an HMM in any state terminates the sequence without emitting any symbol

% trans(S0,S1,H): sample the next state S1 given the current state S0.
% H, the list of states generated so far, identifies the time step, so
% that the choices made at different steps are independent random
% variables.
trans(S0,S1,H):-
  findall((S,P),pc_c(S,S0,P),L),
  append(L0,[(LastS,_P)],L),
  foldl(pick_next_state(S0,H),L0,(1,_),(_,S1)),
  (var(S1)->
    S1=LastS
  ;
    true
  ).

% the accumulator (P0,V0) carries the residual probability mass P0 and the
% state V0 picked so far (unbound until a choice succeeds)
pick_next_state(_S0,_H,_L,(P0,V0),(P0,V0)):-
  nonvar(V0).

pick_next_state(S0,H,(S,P),(P0,V0),(P1,V1)):-
  var(V0),
  PF is P/P0,
  (prob_fact_state(S0,S,H,PF)->
    P1=PF,
    V1=S
  ;
    P1 is P0*(1-PF),
    V1=V0
  ).

% probabilistic fact with a variable probability: succeeds with probability P
prob_fact_state(_,_,_,P):P.

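% How the renormalisation works (illustrative numbers, not from the
% dataset): if pc_c yields [(a,0.5),(b,0.3),(c,0.2)], a is picked with
% probability 0.5/1 = 0.5; on failure the residual mass is 0.5, so b is
% picked with probability 0.3/0.5 = 0.6, i.e. 0.5*0.6 = 0.3 overall; if
% that also fails, the last element c is taken deterministically, with
% overall probability 0.5*0.4 = 0.2. A categorical distribution is thus
% encoded as a chain of binary probabilistic facts.
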
% out(S0,W,H): sample the word W emitted in state S0, using the same
% renormalisation scheme as trans/3
out(S0,W,H):-
  findall((W,P),pw_c(W,S0,P),L),
  append(L0,[(LastW,_P)],L),
  foldl(pick_word(S0,H),L0,(1,_),(_,W)),
  (var(W)->
    W=LastW
  ;
    true
  ).

pick_word(_S0,_H,_L,(P0,V0),(P0,V0)):-
  nonvar(V0).

pick_word(S0,H,(W,P),(P0,V0),(P1,V1)):-
  var(V0),
  PF is P/P0,
  (prob_fact_word(S0,W,H,PF)->
    P1=PF,
    V1=W
  ;
    P1 is P0*(1-PF),
    V1=V0
  ).

prob_fact_word(_,_,_,P):P.

% trans2(S0,S1,S2,H): sample the next state S2 given the two previous
% states S0 and S1
trans2(S0,S1,S2,H):-
  findall((S,P),pc_cc(S,S0,S1,P),L),
  append(L0,[(LastS,_P)],L),
  foldl(pick_next_state2(S0,S1,H),L0,(1,_),(_,S2)),
  (var(S2)->
    S2=LastS
  ;
    true
  ).

pick_next_state2(_S0,_S1,_H,_L,(P0,V0),(P0,V0)):-
  nonvar(V0).

pick_next_state2(S0,S1,H,(S,P),(P0,V0),(P1,V1)):-
  var(V0),
  PF is P/P0,
  (prob_fact_state2(S0,S1,S,H,PF)->
    P1=PF,
    V1=S
  ;
    P1 is P0*(1-PF),
    V1=V0
  ).

prob_fact_state2(_,_,_,_,P):P.

% out2(S0,S1,W,H): sample the word W emitted with previous state S0 and
% current state S1
out2(S0,S1,W,H):-
  findall((W,P),pw_cc(W,S0,S1,P),L),
  append(L0,[(LastW,_P)],L),
  foldl(pick_word2(S0,S1,H),L0,(1,_),(_,W)),
  (var(W)->
    W=LastW
  ;
    true
  ).

pick_word2(_S0,_S1,_H,_L,(P0,V0),(P0,V0)):-
  nonvar(V0).

pick_word2(S0,S1,H,(W,P),(P0,V0),(P1,V1)):-
  var(V0),
  PF is P/P0,
  (prob_fact_word2(S0,S1,W,H,PF)->
    P1=PF,
    V1=W
  ;
    P1 is P0*(1-PF),
    V1=V0
  ).

prob_fact_word2(_,_,_,_,P):P.

:- end_lpad.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% con.mle.pl
%
% MLE CONTEXTUAL MODEL
%
% Maximum likelihood estimation of contextual probabilities
% P(c), P(c2|c1), P(c3|c1,c2).
%
% The definitions presuppose that the following
% frequencies are stored in the internal database:
%
% fc(C,F) - Frequency of class C
% fcc(C1,C2,F) - Joint frequency of classes C1 and C2
% fccc(C1,C2,C3,F) - Joint frequency of classes C1, C2 and C3
%
% NB: in the original program fc/2, fcc/3 and fccc/4 had to be defined
% also for the dummy class start used to initialize the tagger; in this
% adaptation the start cases are instead handled by the dedicated
% pc_c/3 and pc_cc/4 clauses below.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% P(c)
pc(C,P) :-
  fc(C,F),
  tokens(N),
  P is F/N.

% P(c2|c1)
pc_c(C,start,P):-!,
  fc(C,F1),
  tokens(F2),
  P is F1/F2.

pc_c(C2,C1,P) :-
  fcc(C1,C2,F1),
  fc(C1,F2),
  P is F1/F2.

pc_c(C2,C1,0) :-
  \+ fcc(C1,C2,_).

pc_cc(C,start,start,P):-!,
  pc_c(C,start,P).

pc_cc(C,start,C1,P):-!,
  pc_c(C,C1,P).

% P(c3|c1,c2)
pc_cc(C3,C1,C2,P) :-
  fccc(C1,C2,C3,F1),
  fcc(C1,C2,F2),
  P is F1/F2.

pc_cc(C3,C1,C2,0) :-
  \+ fccc(C1,C2,C3,_).

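% Worked example on the frequency data below (illustrative): pc(vb,P)
% gives P = 7/31 since fc(vb,7) and tokens(31); pc_c(vb,nn,P) gives
% P = fcc(nn,vb,1)/fc(nn,6) = 1/6; pc_cc(cn,dt,nn,P) gives
% P = fccc(dt,nn,cn,1)/fcc(dt,nn,5) = 1/5.
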
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% lex.mle.pl
%
% MLE LEXICAL MODEL
%
% Maximum likelihood estimation of lexical probabilities P(w|c).
% The definition presupposes that the following
% frequencies are stored in the internal database:
%
% fwc(W,C,F) - Joint frequency of word W and class C
% fc(C,F) - Frequency of class C
%
% It also presupposes that the set of open classes (i.e. classes
% allowed for unknown words) is defined by means of clauses of
% the form:
%
% open(C)
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
/*
pw_c(W,C,P) :-
  fwc(W,C,F1),
  fc(C,F2),
  P is F1/F2.

pw_c(W,C,0) :-
  \+ fwc(W,_,_),
  open(C).
*/
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% lex.uni.pl
%
% UNIFORM SMOOTHING OF LEXICAL MODEL
%
% For known words, the lexical probability is a standard
% maximum likelihood estimate:
%
% P(w|c) = f(w,c) / f(c)
%
% except that f(c) is adjusted by adding f(c)/n for open
% word classes.
%
% For unknown words, the lexical probability is 1/n
% for all open classes, where n is the number of tokens
% in the training corpus.
%
% The definition presupposes that the following
% frequencies are stored in the internal database:
%
% fwc(W,C,F) - Joint frequency of word W and class C
% fc(C,F) - Frequency of class C
%
% It also presupposes that the set of open classes
% (i.e. those allowed for unknown words) is defined by
% clauses of the following form:
%
% open(C)
%
% Finally, it presupposes that the total number of tokens
% in the training corpus is defined:
%
% tokens(N)
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
pw_c(W,C,P) :-
  fwc(W,C,F1),
  fc(C,F2),
  \+ open(C),
  P is F1/F2.

pw_c(W,C,P) :-
  fwc(W,C,F1),
  fc(C,F2),
  open(C),
  tokens(N),
  P is F1/(F2+(F2/N)).

pw_c(W,C,P) :-
  \+ fwc(W,_,_),
  open(C),
  tokens(N),
  P is 1/N.

% P(w|c1,c2): lexical probability conditioned on the current class C2 and
% the preceding class C1; falls back on pw_c/3 when the joint frequency of
% W, C1 and C2 does not exceed the threshold hfreq(F)
pw_cc(W,C1,C2,P) :-
  fwcc(W,C1,C2,F1),
  hfreq(F),
  F1 > F,
  fcc(C1,C2,F2),
  P is F1/F2.

pw_cc(W,C1,C2,P) :-
  pw_c(W,C2,P),
%  fc(C1,_),
  \+ (fwcc(W,C1,C2,F1), hfreq(F), F1 > F).

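% Worked example on the data below (illustrative): pw_c(can,vb,P) gives
% P = 3/(7+7/31), about 0.415, since fwc(can,vb,3), fc(vb,7), open(vb)
% and tokens(31); pw_c(the,dt,P) gives the plain MLE 3/5 because dt is
% not an open class; for an unknown word W, pw_c(W,C,P) gives P = 1/31
% for every open class C.
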
% open classes, i.e. classes allowed for unknown words (some of them do
% not occur in the toy corpus below)
open(ab).
open(in).
open(jj).
open(nn).
open(pc).
open(p).
open(rg).
open(uo).
open(vb).

hfreq(0).
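% With hfreq(0), any lexical trigram attested in the data (frequency >= 1)
% is used directly by pw_cc/4; raising the threshold makes the model back
% off to pw_c/3 more often.
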
/* Corpus
I/pn can/vb light/vb a/dt fire/nn and/cn you/pn
can/vb open/vb a/dt can/nn of/pp beans/nn ./dl
Now/ab the/dt can/nn is/vb open/jj ,/dl and/cn
we/pn can/vb eat/vb in/pp the/dt light/nn of/pp
the/dt fire/nn ./dl

Frequency Databases
*/
fwc('.', dl, 2).
fwc(',', dl, 1).
fwc(a, dt, 2).
fwc(and, cn, 2).
fwc(beans, nn, 1).
fwc(can, nn, 2).
fwc(can, vb, 3).
fwc(eat, vb, 1).
fwc(fire, nn, 2).
fwc('I', pn, 1).
fwc(in, pp, 1).
fwc(is, vb, 1).
fwc(light, nn, 1).
fwc(light, vb, 1).
fwc('Now', ab, 1).
fwc(of, pp, 2).
fwc(open, vb, 1).
fwc(open, jj, 1).
fwc(the, dt, 3).
fwc(we, pn, 1).
fwc(you, pn, 1).

% fwcc(W,C1,C2,F) - joint frequency of word W with previous class C1 and
% current class C2
fwcc('.', nn, dl, 2).
fwcc(',', jj, dl, 1).
fwcc(a, vb, dt, 2).
fwcc(and, nn, cn, 1).
fwcc(and, dl, cn, 1).
fwcc(beans, pp, nn, 1).
fwcc(can, dt, nn, 2).
fwcc(can, pn, vb, 3).
fwcc(eat, vb, vb, 1).
fwcc(fire, dt, nn, 2).
%fwcc('I', start, pn, 1).
fwcc(in, vb, pp, 1).
fwcc(is, nn, vb, 1).
fwcc(light, dt, nn, 1).
fwcc(light, vb, vb, 1).
fwcc('Now', dl, ab, 1).
fwcc(of, nn, pp, 2).
fwcc(open, vb, vb, 1).
fwcc(open, vb, jj, 1).
fwcc(the, ab, dt, 1).
fwcc(the, pp, dt, 2).
fwcc(we, cn, pn, 1).
fwcc(you, cn, pn, 1).

%fc(start,F):-
%  tokens(F).

fc(ab, 1).
fc(cn, 2).
fc(dl, 3).
fc(dt, 5).
fc(jj, 1).
fc(nn, 6).
fc(pn, 3).
fc(pp, 3).
fc(vb, 7).

%fcc(start,C1,F):-
%  fc(C1,F).

fcc(ab, dt, 1).
fcc(cn, pn, 2).
fcc(dl, ab, 1).
fcc(dl, cn, 1).
fcc(dt, nn, 5).
fcc(jj, dl, 1).
fcc(nn, cn, 1).
fcc(nn, dl, 2).
fcc(nn, pp, 2).
fcc(nn, vb, 1).
fcc(pn, vb, 3).
fcc(pp, dt, 2).
fcc(pp, nn, 1).
fcc(vb, dt, 2).
fcc(vb, jj, 1).
fcc(vb, pp, 1).
fcc(vb, vb, 3).

fccc(ab, dt, nn, 1).
fccc(cn, pn, vb, 2).
fccc(dl, ab, dt, 1).
fccc(dl, cn, pn, 1).
fccc(dt, nn, cn, 1).
fccc(dt, nn, dl, 1).
fccc(dt, nn, pp, 2).
fccc(dt, nn, vb, 1).
fccc(jj, dl, cn, 1).
fccc(nn, cn, pn, 1).
fccc(nn, dl, ab, 1).
fccc(nn, pp, dt, 1).
fccc(nn, pp, nn, 1).
fccc(nn, vb, jj, 1).
fccc(pn, vb, vb, 3).
fccc(pp, nn, dl, 1).
fccc(pp, dt, nn, 2).
fccc(vb, dt, nn, 2).
fccc(vb, jj, dl, 1).
fccc(vb, pp, dt, 1).
fccc(vb, vb, dt, 2).
fccc(vb, vb, pp, 1).

% cache corpus statistics derived from the frequency tables: the number
% of distinct classes, the total number of tokens and the number of
% distinct word types
classes :-
  setof(C,F^fc(C,F),Cs),
  length(Cs,N),
  assert(classes(N)).

tokens :-
  bagof(F,W^C^fwc(W,C,F),Fs),
  sum_list(Fs,N),
  assert(tokens(N)).

types :-
  setof(W,C^F^fwc(W,C,F),Ws),
  length(Ws,N),
  assert(types(N)).

:- tokens.
:- types.
:- classes.
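% For the toy corpus above these directives assert tokens(31), types(18)
% and classes(9).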