1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2006-2020, University of Amsterdam 7 VU University Amsterdam 8 CWI, Amsterdam 9 All rights reserved. 10 11 Redistribution and use in source and binary forms, with or without 12 modification, are permitted provided that the following conditions 13 are met: 14 15 1. Redistributions of source code must retain the above copyright 16 notice, this list of conditions and the following disclaimer. 17 18 2. Redistributions in binary form must reproduce the above copyright 19 notice, this list of conditions and the following disclaimer in 20 the documentation and/or other materials provided with the 21 distribution. 22 23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 POSSIBILITY OF SUCH DAMAGE. 35*/ 36 37:- module(rdf_litindex, 38 [ rdf_set_literal_index_option/1, % +Options 39 rdf_tokenize_literal/2, % +Literal, -Tokens 40 rdf_find_literal/2, % +Spec, -Literal 41 rdf_find_literals/2, % +Spec, -ListOfLiterals 42 rdf_token_expansions/2, % +Spec, -Expansions 43 rdf_stopgap_token/1, % -Token 44 45 rdf_literal_index/2, % +Type, -Index 46 rdf_delete_literal_index/1 % +Type 47 ]). 
48:- autoload(rdf_db, 49 [ rdf_keys_in_literal_map/3, 50 rdf_find_literal_map/3, 51 rdf_new_literal_map/1, 52 rdf_monitor/2, 53 rdf_current_literal/1, 54 rdf_reset_literal_map/1, 55 rdf_insert_literal_map/4, 56 rdf_delete_literal_map/2, 57 rdf/3, 58 rdf_delete_literal_map/3, 59 rdf_insert_literal_map/3, 60 rdf_statistics_literal_map/2 61 ]). 62:- autoload(library(apply),[maplist/3]). 63:- use_module(library(debug),[debug/3]). 64:- autoload(library(double_metaphone),[double_metaphone/2]). 65:- autoload(library(error), 66 [instantiation_error/1,must_be/2,domain_error/2]). 67:- autoload(library(lists),[member/2,flatten/2,append/3]). 68:- autoload(library(porter_stem),[tokenize_atom/2]). 69:- autoload(library(snowball),[snowball/3]).
% Runtime state of the literal indexes.
:- dynamic
    literal_map/2,                  % Type, -Map
    map_building/2,                 % Type, -Queue
    new_token/2,                    % Hook
    setting/1,
    stopgap/1.
% literal_map/2 is declared volatile in addition to dynamic.
:- volatile
    literal_map/2.
% Hooks that may be extended from other files.
:- multifile
    tokenization/2,                 % +Literal, -Tokens
    exclude_from_index/2.           % +Which, +Token


% Default values of the options that can be changed through
% rdf_set_literal_index_option/1.
setting(verbose(false)).            % print progress messages
setting(index_threads(1)).          % # threads for creating the index
setting(index(thread(1))).          % Use a thread for incremental updates
setting(stopgap_threshold(50000)).  % consider token a stopgap over N
%   - verbose(Bool): if `true`, print progress messages while building
%     the index tables.
%   - index(How): How is one of `self` (execute in the same thread),
%     `thread(N)` (execute in N concurrent threads) or `default`
%     (depends on number
%     of cores).

%!  rdf_set_literal_index_option(+Options)
%
%   Set one option or a list of options.  Every option is validated by
%   check_option/1 and then replaces the current setting/1 fact with the
%   same name and arity.
%
%   @param  Options is a single option term or a list of option terms.
%   @error  instantiation_error if Options (or a list element) is
%           unbound.  Without the explicit var/1 test an unbound
%           argument would unify with `[]` and silently succeed.
%   @error  domain_error(literal_option, Option) for an unknown option.

rdf_set_literal_index_option(Options) :-
    var(Options),
    !,
    instantiation_error(Options).
rdf_set_literal_index_option([]) :- !.
rdf_set_literal_index_option([H|T]) :-
    !,
    set_option(H),
    rdf_set_literal_index_option(T).
rdf_set_literal_index_option(Option) :-
    set_option(Option).

%   set_option(+Term)
%
%   Validate Term and store it as the only setting/1 fact with its
%   name and arity.

set_option(Term) :-
    check_option(Term),
    functor(Term, Name, Arity),
    functor(General, Name, Arity),      % most general term, e.g. verbose(_)
    retractall(setting(General)),
    assert(setting(Term)).

%   check_option(+Option)
%
%   Succeed if Option is a known option with a valid value; raise an
%   error otherwise.

check_option(X) :-
    var(X),
    !,
    instantiation_error(X).
check_option(verbose(X)) :-
    !,
    must_be(boolean, X).
check_option(index_threads(Count)) :-
    !,
    must_be(nonneg, Count).
check_option(stopgap_threshold(Count)) :-
    !,
    must_be(nonneg, Count).
check_option(index(How)) :-
    !,
    must_be(oneof([default,thread(_),self]), How).
check_option(Option) :-
    domain_error(literal_option, Option).


                 /*******************************
                 *            QUERY             *
                 *******************************/
%   ```
%   Spec ::= and(Spec,Spec)
%   Spec ::= or(Spec,Spec)
%   Spec ::= not(Spec)
%   Spec ::= sounds(Like)
%   Spec ::= stem(Like)             % same as stem(Like, en)
%   Spec ::= stem(Like, Lang)
%   Spec ::= prefix(Prefix)
%   Spec ::= between(Low, High)     % Numerical between
%   Spec ::= ge(Low)                % Numerical greater-equal
%   Spec ::= le(High)               % Numerical less-equal
%   Spec ::= Token
%   ```
%   sounds(Like) and stem(Like) both map to a disjunction. First we
%   compile the spec to normal form: a disjunction of conjunctions
%   on elementary tokens. Then we execute all the conjunctions and
%   generate the union using ordered-set algorithms.
%
%   Stopgaps are ignored. If the final result is only a stopgap, the
%   predicate fails.
%!  rdf_find_literal(+Spec, -Literal)
%!  rdf_find_literals(+Spec, -ListOfLiterals)
%
%   rdf_find_literals/2 compiles Spec using compile_spec/2, fails if
%   the compiled form is the marker @(stopgap) and otherwise looks the
%   expression up in the token index.  The per-disjunct results are
%   flattened and sorted, so the answer is an ordered list without
%   duplicates.  rdf_find_literal/2 enumerates the members of that
%   list on backtracking.

rdf_find_literal(Spec, Literal) :-
    rdf_find_literals(Spec, Matches),
    member(Literal, Matches).

rdf_find_literals(Spec, Literals) :-
    compile_spec(Spec, DNF),
    DNF \== @(stopgap),
    token_index(TokenMap),
    lookup(DNF, TokenMap, _Conditions, PerDisjunct),
    flatten(PerDisjunct, Unsorted),
    sort(Unsorted, Literals).
%!  rdf_token_expansions(+Spec, -Expansions)
%
%   Relate a search specification to the tokens it expands to.  The
%   first three clauses answer prefix(_), sounds(_) and stem(_)
%   directly from the corresponding index.  The clauses are not
%   mutually exclusive: the final clause also applies on backtracking.
%   It compiles Spec, performs the lookup and groups the tagged
%   conditions that contributed to a non-empty result.

rdf_token_expansions(prefix(Prefix), [prefix(Prefix, Tokens)]) :-
    token_index(TokenMap),
    rdf_keys_in_literal_map(TokenMap, prefix(Prefix), Tokens).
rdf_token_expansions(sounds(Like), [sounds(Like, Tokens)]) :-
    metaphone_index(MetaphoneMap),
    rdf_find_literal_map(MetaphoneMap, [Like], Tokens).
rdf_token_expansions(stem(Like), [stem(Like, Tokens)]) :-
    stem_index(StemMap),
    rdf_find_literal_map(StemMap, [Like], Tokens).
rdf_token_expansions(Spec, Expansions) :-
    compile_spec(Spec, DNF),
    token_index(TokenMap),
    lookup(DNF, TokenMap, NestedConditions, _Literals),
    flatten(NestedConditions, Conditions),
    sort(Conditions, Ordered),
    join_expansions(Ordered, Expansions).

%   join_expansions(+Tagged, -Joined)
%
%   Merge runs of tagged tokens that share the same tag into one term
%   whose extra last argument is the list of tokens, e.g.
%   [prefix(p,a), prefix(p,b)] becomes [prefix(p,[a,b])].

join_expansions([], []).
join_expansions([First|Rest0], [Joined|JoinedRest]) :-
    untag(First, Tag, Value),
    Tag =.. TagParts,
    append(TagParts, [[Value|MoreValues]], JoinedParts),
    Joined =.. JoinedParts,
    join_expansions_by_tag(Rest0, Tag, Rest, MoreValues),
    join_expansions(Rest, JoinedRest).

%   join_expansions_by_tag(+Tagged, +Tag, -Remaining, -Values)
%
%   Collect the values of the leading elements carrying Tag.

join_expansions_by_tag([Elem|Rest0], Tag, Rest, [Value|Values]) :-
    untag(Elem, Tag, Value),
    !,
    join_expansions_by_tag(Rest0, Tag, Rest, Values).
join_expansions_by_tag(Remaining, _, Remaining, []).

%   lookup(+DNF, +Map, -Conditions, -Literals)
%
%   Evaluate a disjunction.  Conditions and Literals are nested lists
%   that follow the shape of the or/2 tree; @(false) yields nothing.

lookup(@(false), _, [], []) :- !.
lookup(or(Left, Right), Map, [CondL|CondR], [LitL|LitR]) :-
    !,
    lookup(Left, Map, CondL, LitL),
    lookup(Right, Map, CondR, LitR).
lookup(Conj, Map, [Cond], [Literals]) :-
    lookup1(Conj, Map, Cond, Literals).

%   lookup1(+Conj, +Map, -Cond, -Literals)
%
%   Evaluate one conjunction.  Cond lists the tagged tokens of Conj if
%   the conjunction matched at least one literal and is [] otherwise.
%   A conjunction containing @(false) has no matches.

lookup1(Conj, Map, Cond, Literals) :-
    phrase(conj_to_list(Conj), Keys),
    !,
    rdf_find_literal_map(Map, Keys, Literals),
    (   Literals == []
    ->  Cond = []
    ;   phrase(conj_to_cond(Conj), Cond)
    ).
lookup1(_, _, _, []).

%   conj_to_list(+Conj)//
%
%   Emit the plain keys of a conjunction, stripping tags.  Fails if
%   the conjunction contains @(false).

conj_to_list(and(Left, Right)) -->
    !,
    conj_to_list(Left),
    conj_to_list(Right).
conj_to_list(@(false)) -->
    !,
    {fail}.
conj_to_list(Tagged) -->
    { untag(Tagged, Key) },
    !,
    [Key].
conj_to_list(Key) -->
    [Key].


%   conj_to_cond(+Conj)//
%
%   Emit only the tagged members of a conjunction.

conj_to_cond(and(Left, Right)) -->
    !,
    conj_to_cond(Left),
    conj_to_cond(Right).
conj_to_cond(Tagged) -->
    { untag(Tagged, _) },
    !,
    [ Tagged ].
conj_to_cond(_) -->
    [].
%!  compile_spec(+Spec, -DNF)
%
%   Compile a search specification: expand the fuzzy parts into
%   disjunctions of tagged tokens (expand_fuzzy/2), push negations
%   inward (nnf/2) and convert to disjunctive normal form (dnf/2).

compile_spec(Spec, DNF) :-
    expand_fuzzy(Spec, Spec2),
    nnf(Spec2, NNF),
    dnf(NNF, DNF).


%   expand_fuzzy(+Spec, -Expanded)
%
%   Replace sounds/1, stem/1,2, prefix/1, case/1, between/2, le/1 and
%   ge/1 by a disjunction of the matching index keys, each tagged with
%   the expression it came from.  An expression without matches
%   becomes @(false); an atomic stopgap token becomes @(stopgap).
%   If the argument of sounds, stem, prefix or case is not an atom it
%   is expanded as a specification itself.
%
%   @error instantiation_error if (a part of) Spec is unbound.
%   @error type_error(boolean_expression, Term) if Term is not a valid
%          specification.

expand_fuzzy(Var, _) :-
    var(Var),
    !,
    throw(error(instantiation_error, _)).
expand_fuzzy(sounds(Like), Or) :-
    !,
    (   atom(Like)
    ->  metaphone_index(Map),
        double_metaphone(Like, Key),
        rdf_find_literal_map(Map, [Key], Tokens),
        list_to_or(Tokens, sounds(Like), Or)
    ;   expand_fuzzy(Like, Or)
    ).
expand_fuzzy(stem(Like), Or) :-
    !,
    expand_fuzzy(stem(Like, en), Or).
expand_fuzzy(stem(Like, Lang), Or) :-
    !,
    (   atom(Like)
    ->  stem_index(Map),
        stem(Like, Lang, Key),
        rdf_find_literal_map(Map, [Key], Tokens),
        list_to_or(Tokens, stem(Like), Or)
    ;   expand_fuzzy(Like, Or)
    ).
expand_fuzzy(prefix(Prefix), Or) :-
    !,
    (   atom(Prefix)
    ->  token_index(Map),
        rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens),
        list_to_or(Tokens, prefix(Prefix), Or)
    ;   expand_fuzzy(Prefix, Or)
    ).
expand_fuzzy(case(String), Or) :-
    !,
    (   atom(String)
    ->  token_index(Map),
        rdf_keys_in_literal_map(Map, case(String), Tokens),
        list_to_or(Tokens, case(String), Or)
    ;   expand_fuzzy(String, Or)
    ).
expand_fuzzy(or(A0, B0), E) :-
    !,
    expand_fuzzy(A0, A),
    expand_fuzzy(B0, B),
    simplify(or(A,B), E).
expand_fuzzy(and(A0, B0), E) :-
    !,
    expand_fuzzy(A0, A),
    expand_fuzzy(B0, B),
    simplify(and(A,B), E).
expand_fuzzy(not(A0), not(A)) :-
    !,
    expand_fuzzy(A0, A).
expand_fuzzy(between(Low, High), Or) :-
    !,
    token_index(Map),
    rdf_keys_in_literal_map(Map, between(Low, High), Tokens),
    list_to_or(Tokens, between(Low, High), Or).
expand_fuzzy(le(High), Or) :-
    !,
    token_index(Map),
    rdf_keys_in_literal_map(Map, le(High), Tokens),
    list_to_or(Tokens, le(High), Or).
expand_fuzzy(ge(Low), Or) :-
    !,
    token_index(Map),
    rdf_keys_in_literal_map(Map, ge(Low), Tokens),
    list_to_or(Tokens, ge(Low), Or).
expand_fuzzy(Token, Result) :-
    atomic(Token),
    !,
    (   rdf_stopgap_token(Token)
    ->  Result = @(stopgap)
    ;   Result = Token
    ).
expand_fuzzy(Token, _) :-
    % ISO argument order is type_error(ExpectedType, Culprit)
    throw(error(type_error(boolean_expression, Token), _)).

%   simplify(+Expr0, -Expr)
%
%   Apply one top-level simplification from simple/2 if one applies;
%   otherwise Expr is Expr0.

simplify(Expr0, Expr) :-
    simple(Expr0, Expr),
    !.
simplify(Expr, Expr).

%   simple(+Expr0, -Expr)
%
%   Elimination rules for the markers @(false) and @(stopgap).

simple(and(@(false), _), @(false)).
simple(and(_, @(false)), @(false)).
simple(and(@(stopgap), Token), Token).
simple(and(Token, @(stopgap)), Token).
simple(or(@(false), X), X).
simple(or(X, @(false)), X).
simple(or(@(stopgap), Token), Token).
simple(or(Token, @(stopgap)), Token).


%   list_to_or(+Tokens, +How, -Or)
%
%   Turn a list of tokens into a right-nested or/2 term of tokens
%   tagged with How.  The empty list maps to @(false).

list_to_or([], _, @(false)) :- !.
list_to_or([X], How, One) :-
    !,
    tag(How, X, One).
list_to_or([H0|T0], How, or(H, T)) :-
    tag(How, H0, H),
    list_to_or(T0, How, T).

%   tag(+How, +Token, -Tagged)
%
%   Tagged is How extended with Token as additional last argument.

tag(sounds(X),    Y, sounds(X,Y)).
tag(stem(X),      Y, stem(X,Y)).
tag(prefix(X),    Y, prefix(X,Y)).
tag(case(X),      Y, case(X,Y)).
tag(between(L,H), Y, between(L,H,Y)).
tag(ge(L),        Y, ge(L,Y)).
tag(le(H),        Y, le(H,Y)).

%   untag(+Tagged, -Token)
%
%   Token is the plain token of a tagged term.

untag(sounds(_,Y),    Y).
untag(stem(_,Y),      Y).
untag(prefix(_,Y),    Y).
untag(case(_,Y),      Y).
untag(between(_,_,Y), Y).
untag(le(_,Y),        Y).
untag(ge(_,Y),        Y).

%   untag(+Tagged, -How, -Token)
%
%   Inverse of tag/3.

untag(sounds(X,Y),    sounds(X),    Y).
untag(stem(X,Y),      stem(X),      Y).
untag(prefix(X,Y),    prefix(X),    Y).
untag(case(X,Y),      case(X),      Y).
untag(between(L,H,Y), between(L,H), Y).
untag(ge(L,Y),        ge(L),        Y).
untag(le(H,Y),        le(H),        Y).
%!  nnf(+Formula, -NNF)
%
%   Rewrite Formula into Negation Normal Form: double negations are
%   removed and negations of and/2 and or/2 are pushed inward using
%   De Morgan's laws, so not/1 only remains around elementary terms.
%
%   The rewrite descends through and/2 and or/2.  Without that, a
%   negation nested below a conjunction or disjunction, as in
%   and(x, not(or(a,b))), would be left in place.

nnf(not(not(A0)), A) :-
    !,
    nnf(A0, A).
nnf(not(and(A0,B0)), or(A,B)) :-
    !,
    nnf(not(A0), A),
    nnf(not(B0), B).
nnf(not(or(A0,B0)), and(A,B)) :-
    !,
    nnf(not(A0), A),
    nnf(not(B0), B).
nnf(and(A0,B0), and(A,B)) :-
    !,
    nnf(A0, A),
    nnf(B0, B).
nnf(or(A0,B0), or(A,B)) :-
    !,
    nnf(A0, A),
    nnf(B0, B).
nnf(A, A).
%!  dnf(+NNF, -DNF)
%
%   Convert a formula in negation normal form into Disjunctive Normal
%   Form, i.e. a disjunction of conjunctions.  Both sides of an and/2
%   are converted first; dnf1/2 then distributes the conjunction over
%   any disjunction found on either side.

dnf(or(Left0, Right0), or(Left, Right)) :-
    !,
    dnf(Left0, Left),
    dnf(Right0, Right).
dnf(and(Left0, Right0), DNF):-
    !,
    dnf(Left0, Left),
    dnf(Right0, Right),
    dnf1(and(Left, Right), DNF).
dnf(Elementary, Elementary).

%   dnf1(+Conj, -DNF)
%
%   Distribute and/2 over or/2.  Note that for a disjunction on the
%   left the other operand becomes the first argument of the
%   resulting conjunctions.

dnf1(and(Other, or(Alt1, Alt2)), or(DNF1, DNF2)) :-
    !,
    dnf1(and(Other, Alt1), DNF1),
    dnf1(and(Other, Alt2), DNF2).
dnf1(and(or(Alt1, Alt2), Other), or(DNF1, DNF2)) :-
    !,
    dnf1(and(Other, Alt1), DNF1),
    dnf1(and(Other, Alt2), DNF2).
dnf1(DNF, DNF).


                 /*******************************
                 *          TOKEN INDEX         *
                 *******************************/
%!  token_index(-Map)
%
%   Get the literal map of the token index.  If the map is already
%   registered in literal_map/2 we only wait until it is no longer
%   being built.  Otherwise a new map is created and registered, the
%   monitor for incremental updates is installed and a detached
%   thread is started to fill the map; the caller then waits for that
%   thread to finish.

token_index(Map) :-
    literal_map(token, Map),
    !,
    wait_for_map(token).
token_index(Map) :-
    rdf_new_literal_map(Map),
    assert(literal_map(token, Map)),
    register_token_updater,
    message_queue_create(Queue),
    assert(map_building(token, Queue)),
    ThreadOptions = [ alias('__rdf_tokenizer'),
                      detached(true)
                    ],
    thread_create(make_literal_index(Queue), _, ThreadOptions),
    wait_for_map(token).

%   register_token_updater
%
%   Install the rdf_monitor/2 callback that keeps the token index up
%   to date.  With setting index(default) or index(thread(N)) the
%   update threads are created and the callback is
%   thread_monitor_literal/1; otherwise monitor_literal/1 is called
%   directly.

register_token_updater :-
    Events = [ reset,
               new_literal,
               old_literal
             ],
    (   token_update_thread_count(Count)
    ->  create_update_literal_thread(Count),
        rdf_monitor(thread_monitor_literal, Events)
    ;   rdf_monitor(monitor_literal, Events)
    ).

%   token_update_thread_count(-Count)
%
%   Number of update threads requested by the index/1 setting: 1 for
%   `default`, N for thread(N).  Fails for any other setting.

token_update_thread_count(1) :-
    setting(index(default)),
    !.
token_update_thread_count(Count) :-
    setting(index(thread(Count))).

%   make_literal_index(+Queue)
%
%   Body of the index building thread.  When building ends, Queue is
%   destroyed and the map_building/2 fact for the token map removed;
%   wait_for_map/1 relies on both.

make_literal_index(Queue) :-
    call_cleanup(
        make_literal_index,
        ( message_queue_destroy(Queue),
          retractall(map_building(token, _)))).
%!  make_literal_index
%
%   Create the initial token index from all literals in the database.
%   The number of threads is taken from setting index_threads(N) or,
%   if that setting is absent, from the Prolog flag `cpu_count`.

make_literal_index :-
    setting(index_threads(N)),
    !,
    threaded_literal_index(N),
    verbose('~N', []).
make_literal_index :-
    current_prolog_flag(cpu_count, X),
    threaded_literal_index(X),
    verbose('~N', []).

%   threaded_literal_index(+N)
%
%   Register all current literals.  For N > 1 the literals are handed
%   to N worker threads through a bounded message queue, followed by
%   one done(true) message per worker.  After all workers have been
%   joined nobody reads from the queue anymore, so it is destroyed
%   explicitly, as this file does for its other temporary queues.
%   For N =< 1 the calling thread does all the work.

threaded_literal_index(N) :-
    N > 1,
    !,
    message_queue_create(Q, [max_size(1000)]),
    create_index_threads(N, Q, Ids),
    forall(rdf_current_literal(Literal),
           thread_send_message(Q, Literal)),
    forall(between(1, N, _),
           thread_send_message(Q, done(true))),
    maplist(thread_join, Ids, _),
    message_queue_destroy(Q).
threaded_literal_index(_) :-
    forall(rdf_current_literal(Literal),
           register_literal(Literal)).

%   create_index_threads(+N, +Queue, -Ids)
%
%   Start N threads running index_worker/1 on Queue.

create_index_threads(N, Q, [Id|T]) :-
    N > 0,
    !,
    thread_create(index_worker(Q), Id, []),
    N2 is N - 1,
    create_index_threads(N2, Q, T).
create_index_threads(_, _, []) :- !.

%   index_worker(+Queue)
%
%   Failure driven loop: register literals read from Queue until the
%   message done(true) arrives.

index_worker(Queue) :-
    repeat,
        thread_get_message(Queue, Msg),
        work(Msg).

work(done(true)) :- !.
work(Literal) :-
    register_literal(Literal),
    fail.                               % back into repeat/0
%!  clean_token_index
%
%   Reset every literal map registered in literal_map/2, regardless of
%   its type, and forget all dynamically collected stopgap tokens.

clean_token_index :-
    forall(literal_map(_Type, Map),
           rdf_reset_literal_map(Map)),
    retractall(stopgap(_)).
%!  rdf_delete_literal_index(+Type)
%
%   Remove the index Type from literal_map/2 and reset its map.  The
%   map is reset rather than destroyed because destroying is unsafe.
%   Fails silently if no index of the given type is registered.
%
%   @error type_error(atom, Type) if Type is not an atom.

rdf_delete_literal_index(Type) :-
    must_be(atom, Type),
    retract(literal_map(Type, Map)),
    !,
    rdf_reset_literal_map(Map).         % destroy is unsafe

                 /*******************************
                 *        THREADED UPDATE       *
                 *******************************/
%   rdf_persistency.pl, most of the time is spent updating the literal
%   token database. While loading the RDF triples, most of the time is
%   spent in updating the AVL tree holding the literals. Updating the
%   token index hangs on updating the AVL trees holding the tokens.
%   Both tasks however
%   can run concurrently.

%   create_update_literal_thread(+Threads)
%
%   Create the message queue with alias `rdf_literal_monitor_queue`
%   (at most 50000 pending messages) and start Threads workers that
%   register the literals sent to it.

create_update_literal_thread(Threads) :-
    QueueOptions = [ alias(rdf_literal_monitor_queue),
                     max_size(50000)
                   ],
    message_queue_create(_, QueueOptions),
    forall(between(1, Threads, _),
           create_index_worker(initial)).

:- dynamic
    index_worker_id/1,              % next free worker number
    extra_worker_count/1.           % number of `extra` workers

%   create_index_worker(+Status)
%
%   Start a worker thread with alias rdf_literal_monitor_<N>, where N
%   is taken from index_worker_id/1 (starting at 1) and incremented
%   for the next worker.  Status is `initial` or `extra`; see
%   monitor_literals/1.

create_index_worker(Status) :-
    (   retract(index_worker_id(Id))
    ->  true
    ;   Id = 1
    ),
    succ(Id, NextId),
    assertz(index_worker_id(NextId)),
    atom_concat(rdf_literal_monitor_, Id, Alias),
    inc_extra_worker_count(Status),
    thread_create(monitor_literals(Status), _,
                  [ alias(Alias)
                  ]).

%   monitor_literals(+Status)
%
%   Worker loop.  An `initial` worker processes the queue forever.
%   An `extra` worker stops as soon as no message arrives within one
%   second; it then decrements the extra worker count and detaches
%   itself.

monitor_literals(initial) :-
    set_prolog_flag(agc_margin, 0), % we don't create garbage
    repeat,
        thread_get_message(rdf_literal_monitor_queue, Literal),
        register_literal(Literal),
        fail.
monitor_literals(extra) :-
    set_prolog_flag(agc_margin, 0),
    repeat,
        (   thread_get_message(rdf_literal_monitor_queue, Literal,
                               [ timeout(1)
                               ])
        ->  register_literal(Literal),
            fail                    % next message
        ;   !                       % timeout: leave the loop
        ),
    with_mutex(create_index_worker, dec_extra_worker_count),
    thread_self(Me),
    thread_detach(Me).

%   thread_monitor_literal(+Action)
%
%   rdf_monitor/2 callback for threaded updates: new literals are
%   queued for the workers, any other action is handled synchronously
%   by monitor_literal/1.

thread_monitor_literal(new_literal(Literal)) :-
    !,
    thread_send_message(rdf_literal_monitor_queue, Literal).
thread_monitor_literal(Action) :-
    monitor_literal(Action).
%   check_index_workers(+Alias, +Keys)
%
%   Start an additional (`extra`) index worker if the queue with the
%   given alias holds more than 10000 messages and fewer than
%   max_extra_workers/1 extra workers are registered.  Always
%   succeeds.  Keys is only used for the debug message.

check_index_workers(Alias, Keys) :-
    max_extra_workers(Max),
    Max > 0,
    message_queue_property(Queue, alias(Alias)),
    message_queue_property(Queue, size(Size)),
    Size > 10000,
    \+ ( extra_worker_count(Extra),
         Extra >= Max
       ),
    !,
    debug(rdf_litindex,
          'Creating extra literal indexer (Queue=~D, Keys=~D)',
          [Size, Keys]),
    with_mutex(create_index_worker, create_index_worker(extra)).
check_index_workers(_, _).

%   inc_extra_worker_count(+Status)
%
%   Increment the counter in extra_worker_count/1 if Status is
%   `extra`; do nothing for any other status.

inc_extra_worker_count(extra) :-
    !,
    (   retract(extra_worker_count(Count0))
    ->  Count is Count0+1
    ;   Count = 1
    ),
    asserta(extra_worker_count(Count)).
inc_extra_worker_count(_).

%   dec_extra_worker_count
%
%   Decrement the counter in extra_worker_count/1 if it exists.

dec_extra_worker_count :-
    (   retract(extra_worker_count(Count0))
    ->  Count is Count0-1,
        asserta(extra_worker_count(Count))
    ;   true
    ).

%   max_extra_workers(-Max)
%
%   Max is half the value of the Prolog flag `cpu_count`, rounded
%   down.

max_extra_workers(Max) :-
    current_prolog_flag(cpu_count, Cores),
    Max is Cores//2.


                 /*******************************
                 *       MONITORED UPDATE       *
                 *******************************/

%   monitor_literal(+Action)
%
%   rdf_monitor/2 callback that updates the token index in the calling
%   thread.  While the database is being reset, monitoring of
%   old_literal is switched off and all indexes are cleaned at once.

monitor_literal(new_literal(Literal)) :-
    register_literal(Literal).
monitor_literal(old_literal(Literal)) :-
    unregister_literal(Literal).
monitor_literal(transaction(begin, reset)) :-
    rdf_monitor(monitor_literal, [-old_literal]),
    clean_token_index.
monitor_literal(transaction(end, reset)) :-
    rdf_monitor(monitor_literal, [+old_literal]).
%!  register_literal(+Literal)
%
%   Add the tokens of Literal to the token map.  Literals that cannot
%   be tokenized are silently ignored.

register_literal(Literal) :-
    rdf_tokenize_literal(Literal, Tokens0),
    !,
    sort(Tokens0, Tokens),
    text_of(Literal, Lang, Text),
    literal_map(token, Map),
    add_tokens(Tokens, Lang, Text, Map).
register_literal(_).

%   add_tokens(+Tokens, +Lang, +Literal, +Map)
%
%   Insert Token -> Literal into Map for each token.  The action that
%   follows depends on whether rdf_insert_literal_map/4 bound its last
%   argument; see token_inserted/4.

add_tokens([], _, _, _).
add_tokens([Token|Tokens], Lang, Literal, Map) :-
    rdf_insert_literal_map(Map, Token, Literal, Keys),
    token_inserted(Keys, Token, Lang, Map),
    add_tokens(Tokens, Lang, Literal, Map).

%   token_inserted(?Keys, +Token, +Lang, +Map)
%
%   If Keys is unbound (presumably: Token was already a key of Map),
%   check whether Token now has more values than the
%   stopgap_threshold setting; if so record it as stopgap/1 and
%   delete it from Map.  If Keys is bound, run the new_token/2 hooks,
%   print progress every 1000 keys and check the number of index
%   workers every 10000 keys.

token_inserted(Keys, Token, _Lang, Map) :-
    var(Keys),
    !,
    (   rdf_keys_in_literal_map(Map, key(Token), Count),
        setting(stopgap_threshold(Threshold)),
        Count > Threshold
    ->  assert(stopgap(Token)),
        rdf_delete_literal_map(Map, Token)
    ;   true
    ).
token_inserted(Keys, Token, Lang, Map) :-
    forall(new_token(Token, Lang), true),
    (   Keys mod 1000 =:= 0
    ->  progress(Map, 'Tokens'),
        (   Keys mod 10000 =:= 0
        ->  check_index_workers(rdf_literal_monitor_queue, Keys)
        ;   true
        )
    ;   true
    ).
%!  unregister_literal(+Literal)
%
%   Remove Literal from the token index.  Nothing is removed as long
%   as some triple still has literal(Text) as its object, where Text
%   is the text of Literal.  Otherwise the text is deleted from the
%   entry of each of its tokens.

unregister_literal(Literal) :-
    text_of(Literal, _Lang, Text),
    (   rdf(_,_,literal(Text))
    ->  true                        % still something left
    ;   rdf_tokenize_literal(Literal, Tokens0),
        sort(Tokens0, Tokens),
        literal_map(token, Map),
        del_tokens(Tokens, Text, Map)
    ).

%   del_tokens(+Tokens, +Text, +Map)
%
%   Delete the Token -> Text association from Map for every token.

del_tokens([], _, _).
del_tokens([Token|Tokens], Text, Map) :-
    rdf_delete_literal_map(Map, Token, Text),
    del_tokens(Tokens, Text, Map).
%!  rdf_tokenize_literal(+Literal, -Tokens)
%
%   Tokenize a literal.  The multifile hook tokenization/2 is tried
%   first.  Otherwise the text of the literal must be an atom; it is
%   split by tokenize_atom/2 and the result is filtered by
%   select_tokens/2.

rdf_tokenize_literal(Literal, Tokens) :-
    tokenization(Literal, Tokens),
    !.                              % Hook
rdf_tokenize_literal(Literal, Tokens) :-
    text_of(Literal, _Lang, Text),
    atom(Text),
    tokenize_atom(Text, RawTokens),
    select_tokens(RawTokens, Tokens).

%   select_tokens(+Tokens0, -Tokens)
%
%   Keep the tokens accepted by indexable_token/1, preserving order.

select_tokens([], []).
select_tokens([Token|Rest], Selected) :-
    (   indexable_token(Token)
    ->  Selected = [Token|Selected1]
    ;   Selected = Selected1
    ),
    select_tokens(Rest, Selected1).

%   indexable_token(+Token)
%
%   True if Token must be indexed: it is not excluded by the hook
%   exclude_from_index/2 and it is either an integer in the range
%   -1073741824..1073741823 or a non-number that is longer than one
%   character and neither a default nor a dynamic stopgap.  Other
%   numbers are never indexed.

indexable_token(Token) :-
    \+ exclude_from_index(token, Token),
    (   number(Token)
    ->  integer(Token),
        between(-1073741824, 1073741823, Token)
    ;   \+ atom_length(Token, 1),
        \+ default_stopgap(Token),
        \+ stopgap(Token)
    ).
%   - exclude_from_index(token, Token) is true
%   - default_stopgap(Token)
%     is true

%!  rdf_stopgap_token(-Token)
%
%   True if Token is a stopgap token.  If Token is unbound all
%   stopgap tokens that can be enumerated are returned on
%   backtracking; if Token is bound the test succeeds at most once.

rdf_stopgap_token(Token) :-
    var(Token),
    !,
    rdf_stopgap_token2(Token).
rdf_stopgap_token(Token) :-
    rdf_stopgap_token2(Token),
    !.

%   rdf_stopgap_token2(?Token)
%
%   The sources of stopgap tokens: the exclude_from_index/2 hook, the
%   built-in default_stopgap/1 table, any atom of length 1 and the
%   dynamically maintained stopgap/1 facts.

rdf_stopgap_token2(Token) :-
    exclude_from_index(token, Token).
rdf_stopgap_token2(Token) :-
    default_stopgap(Token).
rdf_stopgap_token2(Token) :-
    atom(Token),
    atom_length(Token, 1).
rdf_stopgap_token2(Token) :-
    stopgap(Token).
%!  default_stopgap(?Token)
%
%   Built-in table of tokens that are never indexed.  The clause
%   order determines the order of enumeration.

default_stopgap(and).
default_stopgap(an).
default_stopgap(or).
default_stopgap(of).
default_stopgap(on).
default_stopgap(in).
default_stopgap(this).
default_stopgap(the).
%!  text_of(+LiteralArg, -Lang, -Text)
%
%   Extract the text and language of a literal argument.  The
%   language is `en` for type(xsd:string, Text) and for plain atoms,
%   the given language for lang(Lang, Text), and `-` for any other
%   typed literal and for integers.  Fails for anything else.

text_of(type(xsd:string, Text), en,   Text) :- !.
text_of(type(_, Text),          -,    Text) :- !.
text_of(lang(Lang, Text),       Lang, Text) :- !.
text_of(Text,                   en,   Text) :- atom(Text), !.
text_of(Text,                   -,    Text) :- integer(Text).


                 /*******************************
                 *          STEM INDEX          *
                 *******************************/
%!  stem_index(-Map)
%
%   Get the literal map of the stem index.  If it does not exist yet
%   the map is created and registered, a new_token/2 hook clause is
%   added so that tokens indexed later are stemmed too, and a detached
%   thread fills the map from the literals currently in the database.

stem_index(Map) :-
    literal_map(stem, Map),
    !,
    wait_for_map(stem).
stem_index(Map) :-
    rdf_new_literal_map(Map),
    assert(literal_map(stem, Map)),
    assert((new_token(Token, Lang) :- add_stem(Token, Lang, Map))),
    message_queue_create(Queue),
    assert(map_building(stem, Queue)),
    thread_create(fill_stem_index(Map, Queue), _,
                  [ alias('__rdf_stemmer'),
                    detached(true)
                  ]),
    wait_for_map(stem).

%   wait_for_map(+MapName)
%
%   Block while MapName is registered in map_building/2.  The builder
%   destroys the queue when done; the resulting error from
%   thread_get_message/2 is caught and the condition is tested again.

wait_for_map(MapName) :-
    (   map_building(MapName, Queue)
    ->  catch(thread_get_message(Queue, _), _, true),
        wait_for_map(MapName)
    ;   true
    ).

%   fill_stem_index(+StemMap, +Queue)
%
%   Body of the stem index thread: stem the tokens of all current
%   literals.  Afterwards Queue is destroyed and the map_building/2
%   fact is removed, which releases the threads in wait_for_map/1.

fill_stem_index(StemMap, Queue) :-
    call_cleanup(
        forall(rdf_current_literal(Literal),
               stem_literal_tokens(Literal, StemMap)),
        ( message_queue_destroy(Queue),
          retractall(map_building(stem, _)))).

%   stem_literal_tokens(+Literal, +StemMap)
%
%   Add the stems of all tokens of Literal to StemMap.  Literals that
%   cannot be tokenized are ignored.

stem_literal_tokens(Literal, StemMap) :-
    rdf_tokenize_literal(Literal, Tokens),
    !,
    sort(Tokens, Tokens1),
    text_of(Literal, Lang, _Text),
    insert_tokens_stem(Tokens1, Lang, StemMap).
stem_literal_tokens(_,_).

%   insert_tokens_stem(+Tokens, +Lang, +Map)
%
%   Insert Stem -> Token for every atom in Tokens that can be
%   stemmed.  Prints progress every 1000 keys.

insert_tokens_stem([], _, _).
insert_tokens_stem([Token|T], Lang, Map) :-
    (   atom(Token)
    ->  (   stem(Token, Lang, Stem)
        ->  rdf_insert_literal_map(Map, Stem, Token, Keys),
            (   integer(Keys),
                Keys mod 1000 =:= 0
            ->  progress(Map, 'Stem')
            ;   true
            )
        ;   true
        )
    ;   true
    ),
    insert_tokens_stem(T, Lang, Map).


%   add_stem(+Token, +Lang, +Map)
%
%   new_token/2 hook body: add the stem of a newly indexed token to
%   Map.  stem/3 takes the token first and the language second; only
%   atoms are stemmed, as in insert_tokens_stem/3.  Fails if Token is
%   not an atom or cannot be stemmed.

add_stem(Token, Lang, Map) :-
    atom(Token),
    stem(Token, Lang, Stem),
    rdf_insert_literal_map(Map, Stem, Token, _).

%   stem(+Token, +LangSpec, -Stem)
%
%   Stem is the snowball/3 stem of the lower-cased Token for the main
%   language of LangSpec.  Fails if snowball/3 raises an exception.

stem(Token, LangSpec, Stem) :-
    main_lang(LangSpec, Lang),
    downcase_atom(Token, Lower),
    catch(snowball(Lang, Lower, Stem), _, fail).

%   main_lang(+LangSpec, -Lang)
%
%   Lang is the part of LangSpec before the first `-`, or the whole
%   of LangSpec if there is none, in lower case.  The prefix is
%   down-cased as well so that both clauses agree.

main_lang(LangSpec, Lang) :-
    sub_atom(LangSpec, Before, _, _, -),
    !,
    sub_atom(LangSpec, 0, Before, _, Lang0),
    downcase_atom(Lang0, Lang).
main_lang(LangSpec, Lang) :-
    downcase_atom(LangSpec, Lang).
931 932 933 /******************************* 934 * METAPHONE INDEX * 935 *******************************/ 936 937 938metaphone_index(Map) :- 939 literal_map(metaphone, Map), 940 !, 941 wait_for_map(metaphone). 942metaphone_index(Map) :- 943 rdf_new_literal_map(Map), 944 assert(literal_map(metaphone, Map)), 945 assert((new_token(Token, Lang) :- add_metaphone(Token, Lang, Map))), 946 message_queue_create(Queue), 947 assert(map_building(metaphone, Queue)), 948 thread_create(fill_metaphone_index(Map, Queue), _, 949 [ alias('__rdf_metaphone_indexer'), 950 detached(true) 951 ]), 952 wait_for_map(metaphone). 953 954fill_metaphone_index(MetaphoneMap, Queue) :- 955 call_cleanup( 956 fill_metaphone_index(MetaphoneMap), 957 ( message_queue_destroy(Queue), 958 retractall(map_building(metaphone, _)))). 959 960fill_metaphone_index(MetaphoneMap) :- 961 token_index(TokenMap), 962 rdf_keys_in_literal_map(TokenMap, all, Tokens), 963 metaphone(Tokens, MetaphoneMap). 964 965metaphone([], _). 966metaphone([Token|T], Map) :- 967 ( atom(Token), 968 double_metaphone(Token, SoundEx) 969 -> rdf_insert_literal_map(Map, SoundEx, Token, Keys), 970 ( integer(Keys), 971 Keys mod 1000 =:= 0 972 -> progress(Map, 'Metaphone') 973 ; true 974 ) 975 ; true 976 ), 977 metaphone(T, Map). 978 979 980add_metaphone(Token, _Lang, Map) :- 981 atom(Token), 982 !, 983 double_metaphone(Token, SoundEx), 984 rdf_insert_literal_map(Map, SoundEx, Token). 985add_metaphone(_, _, _).
%   - `token` map maps tokens to full literal texts.
%   - `stem` map maps stemmed to full tokens.
%   - `metaphone` map maps phonetic
%     keys to tokens.

%!  rdf_literal_index(+Type, -Index)
%
%   Get the literal map for the index Type, which is one of `token`,
%   `stem` or `metaphone`.  The index is created if it does not exist
%   yet.
%
%   @error domain_error(literal_index, Type) for any other Type.

rdf_literal_index(token, Map) :-
    !,
    token_index(Map).
rdf_literal_index(stem, Map) :-
    !,
    stem_index(Map).
rdf_literal_index(metaphone, Map) :-
    !,
    metaphone_index(Map).
rdf_literal_index(Type, _Map) :-
    domain_error(literal_index, Type).


                 /*******************************
                 *             UTIL             *
                 *******************************/

%   verbose(+Format, +Args)
%
%   Print a message to `user_error` if setting verbose(true) is
%   active; succeed silently otherwise.

verbose(Fmt, Args) :-
    setting(verbose(true)),
    !,
    format(user_error, Fmt, Args).
verbose(_, _).

%   progress(+Map, +Which)
%
%   If setting verbose(true) is active, print the number of keys and
%   values of Map on a single line of `user_error`, labelled with
%   Which.  The line starts with a carriage return so that it
%   overwrites the previous report.

progress(Map, Which) :-
    setting(verbose(true)),
    !,
    rdf_statistics_literal_map(Map, size(Keys, Values)),
    format(user_error,
           '\r~t~w: ~12|Keys: ~t~D~15+; Values: ~t~D~20+',
           [Which, Keys, Values]).
progress(_,_).
Search literals
This module finds literals of the RDF database based on words, stemming and sounds like (metaphone). The normal user-level predicate is
*/