View source with raw comments or as raw

    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2006-2020, University of Amsterdam
    7                              VU University Amsterdam
    8                              CWI, Amsterdam
    9    All rights reserved.
   10
   11    Redistribution and use in source and binary forms, with or without
   12    modification, are permitted provided that the following conditions
   13    are met:
   14
   15    1. Redistributions of source code must retain the above copyright
   16       notice, this list of conditions and the following disclaimer.
   17
   18    2. Redistributions in binary form must reproduce the above copyright
   19       notice, this list of conditions and the following disclaimer in
   20       the documentation and/or other materials provided with the
   21       distribution.
   22
   23    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   24    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   25    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   26    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   27    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   28    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   29    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   30    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   31    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   33    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   34    POSSIBILITY OF SUCH DAMAGE.
   35*/
   36
   37:- module(rdf_litindex,
   38          [ rdf_set_literal_index_option/1,     % +Options
   39            rdf_tokenize_literal/2,             % +Literal, -Tokens
   40            rdf_find_literal/2,                 % +Spec, -Literal
   41            rdf_find_literals/2,                % +Spec, -ListOfLiterals
   42            rdf_token_expansions/2,             % +Spec, -Expansions
   43            rdf_stopgap_token/1,                % -Token
   44
   45            rdf_literal_index/2,                % +Type, -Index
   46            rdf_delete_literal_index/1          % +Type
   47          ]).   48:- autoload(rdf_db,
   49	    [ rdf_keys_in_literal_map/3,
   50	      rdf_find_literal_map/3,
   51	      rdf_new_literal_map/1,
   52	      rdf_monitor/2,
   53	      rdf_current_literal/1,
   54	      rdf_reset_literal_map/1,
   55	      rdf_insert_literal_map/4,
   56	      rdf_delete_literal_map/2,
   57	      rdf/3,
   58	      rdf_delete_literal_map/3,
   59	      rdf_insert_literal_map/3,
   60	      rdf_statistics_literal_map/2
   61	    ]).   62:- autoload(library(apply),[maplist/3]).   63:- use_module(library(debug),[debug/3]).   64:- autoload(library(double_metaphone),[double_metaphone/2]).   65:- autoload(library(error),
   66	    [instantiation_error/1,must_be/2,domain_error/2]).   67:- autoload(library(lists),[member/2,flatten/2,append/3]).   68:- autoload(library(porter_stem),[tokenize_atom/2]).   69:- autoload(library(snowball),[snowball/3]).

Search literals

This module finds literals of the RDF database based on words, stemming and sounds like (metaphone). The normal user-level predicate is

rdf_find_literals/2

   79:- dynamic
   80    literal_map/2,                  % Type, -Map
   81    map_building/2,                 % Type, -Queue
   82    new_token/2,                    % Hook
   83    setting/1,
   84    stopgap/1.   85:- volatile
   86    literal_map/2.   87:- multifile
   88    tokenization/2,                 % +Literal, -Tokens
   89    exclude_from_index/2.           % +Which, +Token
   90
   91
   92setting(verbose(false)).                % print progress messages
   93setting(index_threads(1)).              % # threads for creating the index
   94setting(index(thread(1))).              % Use a thread for incremental updates
   95setting(stopgap_threshold(50000)).      % consider token a stopgap over N

rdf_set_literal_index_option(+Options:list)

Set options for the literal package. Currently defined options

verbose(Bool): If true, print progress messages while building the index tables.
index_threads(+Count): Number of threads to use for initial indexing of literals
index(+How): How to deal with indexing new literals. How is one of self (execute in the same thread), thread(N) (execute in N concurrent threads) or default (depends on number of cores).
stopgap_threshold(+Count): Add a token to the dynamic stopgap set if it appears in more than Count literals. The default is 50,000.

  119rdf_set_literal_index_option([]) :- !.
  120rdf_set_literal_index_option([H|T]) :-
  121    !,
  122    set_option(H),
  123    rdf_set_literal_index_option(T).
  124rdf_set_literal_index_option(Option) :-
  125    set_option(Option).
  126
  127set_option(Term) :-
  128    check_option(Term),
  129    functor(Term, Name, Arity),
  130    functor(General, Name, Arity),
  131    retractall(setting(General)),
  132    assert(setting(Term)).
  133
  134check_option(X) :-
  135    var(X),
  136    !,
  137    instantiation_error(X).
  138check_option(verbose(X)) :-
  139    !,
  140    must_be(boolean, X).
  141check_option(index_threads(Count)) :-
  142    !,
  143    must_be(nonneg, Count).
  144check_option(stopgap_threshold(Count)) :-
  145    !,
  146    must_be(nonneg, Count).
  147check_option(index(How)) :-
  148    !,
  149    must_be(oneof([default,thread(_),self]), How).
  150check_option(Option) :-
  151    domain_error(literal_option, Option).
  152
  153
  154                 /*******************************
  155                 *            QUERY             *
  156                 *******************************/

rdf_find_literal(+Spec, -Literal) is nondet

rdf_find_literals(+Spec, -Literals) is det

Find literals in the RDF database matching Spec. Spec is defined as:

Spec ::= and(Spec,Spec)
Spec ::= or(Spec,Spec)
Spec ::= not(Spec)
Spec ::= sounds(Like)
Spec ::= stem(Like)             % same as stem(Like, en)
Spec ::= stem(Like, Lang)
Spec ::= prefix(Prefix)
Spec ::= between(Low, High)     % Numerical between
Spec ::= ge(Low)                % Numerical greater-equal
Spec ::= le(High)               % Numerical less-equal
Spec ::= Token

sounds(Like) and stem(Like) both map to a disjunction. First we compile the spec to normal form: a disjunction of conjunctions on elementary tokens. Then we execute all the conjunctions and generate the union using ordered-set algorithms.

Stopgaps are ignored. If the final result is only a stopgap, the predicate fails.

To be done: - Exploit ordering of numbers and allow for > N, < N, etc.

  188rdf_find_literal(Spec, Literal) :-
  189    rdf_find_literals(Spec, Literals),
  190    member(Literal, Literals).
  191
  192rdf_find_literals(Spec, Literals) :-
  193    compile_spec(Spec, DNF),
  194    DNF \== @(stopgap),
  195    token_index(Map),
  196    lookup(DNF, Map, _, SuperSet),
  197    flatten(SuperSet, Set0),
  198    sort(Set0, Literals).

rdf_token_expansions(+Spec, -Extensions): Determine which extensions of a token contribute to finding literals.

  205rdf_token_expansions(prefix(Prefix), [prefix(Prefix, Tokens)]) :-
  206    token_index(Map),
  207    rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens).
  208rdf_token_expansions(sounds(Like), [sounds(Like, Tokens)]) :-
  209    metaphone_index(Map),
  210    rdf_find_literal_map(Map, [Like], Tokens).
  211rdf_token_expansions(stem(Like), [stem(Like, Tokens)]) :-
  212    stem_index(Map),
  213    rdf_find_literal_map(Map, [Like], Tokens).
  214rdf_token_expansions(Spec, Expansions) :-
  215    compile_spec(Spec, DNF),
  216    token_index(Map),
  217    lookup(DNF, Map, SCS, _),
  218    flatten(SCS, CS),
  219    sort(CS, Expansions0),
  220    join_expansions(Expansions0, Expansions).
  221
  222join_expansions([], []).
  223join_expansions([H0|T0], [H|T]) :-
  224    untag(H0, Tag, V0),
  225    Tag =.. L0,
  226    append(L0, [[V0|Values]], L1),
  227    H =.. L1,
  228    join_expansions_by_tag(T0, Tag, T1, Values),
  229    join_expansions(T1, T).
  230
  231join_expansions_by_tag([H|T0], Tag, T, [V0|VT]) :-
  232    untag(H, Tag, V0),
  233    !,
  234    join_expansions_by_tag(T0, Tag, T, VT).
  235join_expansions_by_tag(L, _, L, []).
  236
  237lookup(@(false), _, [], []) :- !.
  238lookup(or(H0,T0), Map, [CH|CT], [H|T]) :-
  239    !,
  240    lookup(H0, Map, CH, H),
  241    lookup(T0, Map, CT, T).
  242lookup(H0, Map, [C], [H]) :-
  243    lookup1(H0, Map, C, H).
  244
  245lookup1(Conj, Map, Cond, Literals) :-
  246    phrase(conj_to_list(Conj), List),
  247    !,
  248    rdf_find_literal_map(Map, List, Literals),
  249    (   Literals \== []
  250    ->  phrase(conj_to_cond(Conj), Cond)
  251    ;   Cond = []
  252    ).
  253lookup1(_, _, _, []).
  254
  255conj_to_list(and(A,B)) -->
  256    !,
  257    conj_to_list(A),
  258    conj_to_list(B).
  259conj_to_list(@(false)) -->
  260    !,
  261    {fail}.
  262conj_to_list(Tagged) -->
  263    { untag(Tagged, L) },
  264    !,
  265    [L].
  266conj_to_list(L) -->
  267    [L].
  268
  269
  270conj_to_cond(and(A,B)) -->
  271    !,
  272    conj_to_cond(A),
  273    conj_to_cond(B).
  274conj_to_cond(Tagged) -->
  275    { untag(Tagged, _) },
  276    !,
  277    [ Tagged ].
  278conj_to_cond(_) -->
  279    [].

compile_spec(+Spec, -Compiled): Compile a specification as above into disjunctive normal form

  286compile_spec(Spec, DNF) :-
  287    expand_fuzzy(Spec, Spec2),
  288    nnf(Spec2, NNF),
  289    dnf(NNF, DNF).
  290
  291
  292expand_fuzzy(Var, _) :-
  293    var(Var),
  294    !,
  295    throw(error(instantiation_error, _)).
  296expand_fuzzy(sounds(Like), Or) :-
  297    !,
  298    (   atom(Like)
  299    ->  metaphone_index(Map),
  300        double_metaphone(Like, Key),
  301        rdf_find_literal_map(Map, [Key], Tokens),
  302        list_to_or(Tokens, sounds(Like), Or)
  303    ;   expand_fuzzy(Like, Or)
  304    ).
  305expand_fuzzy(stem(Like), Or) :-
  306    !,
  307    expand_fuzzy(stem(Like, en), Or).
  308expand_fuzzy(stem(Like, Lang), Or) :-
  309    !,
  310    (   atom(Like)
  311    ->  stem_index(Map),
  312        stem(Like, Lang, Key),
  313        rdf_find_literal_map(Map, [Key], Tokens),
  314        list_to_or(Tokens, stem(Like), Or)
  315    ;   expand_fuzzy(Like, Or)
  316    ).
  317expand_fuzzy(prefix(Prefix), Or) :-
  318    !,
  319    (   atom(Prefix)
  320    ->  token_index(Map),
  321        rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens),
  322        list_to_or(Tokens, prefix(Prefix), Or)
  323    ;   expand_fuzzy(Prefix, Or)
  324    ).
  325expand_fuzzy(case(String), Or) :-
  326    !,
  327    (   atom(String)
  328    ->  token_index(Map),
  329        rdf_keys_in_literal_map(Map, case(String), Tokens),
  330        list_to_or(Tokens, case(String), Or)
  331    ;   expand_fuzzy(String, Or)
  332    ).
  333expand_fuzzy(or(A0, B0), E) :-
  334    !,
  335    expand_fuzzy(A0, A),
  336    expand_fuzzy(B0, B),
  337    simplify(or(A,B), E).
  338expand_fuzzy(and(A0, B0), E) :-
  339    !,
  340    expand_fuzzy(A0, A),
  341    expand_fuzzy(B0, B),
  342    simplify(and(A,B), E).
  343expand_fuzzy(not(A0), not(A)) :-
  344    !,
  345    expand_fuzzy(A0, A).
  346expand_fuzzy(between(Low, High), Or) :-
  347    !,
  348    token_index(Map),
  349    rdf_keys_in_literal_map(Map, between(Low, High), Tokens),
  350    list_to_or(Tokens, between(Low, High), Or).
  351expand_fuzzy(le(High), Or) :-
  352    !,
  353    token_index(Map),
  354    rdf_keys_in_literal_map(Map, le(High), Tokens),
  355    list_to_or(Tokens, le(High), Or).
  356expand_fuzzy(ge(Low), Or) :-
  357    !,
  358    token_index(Map),
  359    rdf_keys_in_literal_map(Map, ge(Low), Tokens),
  360    list_to_or(Tokens, ge(Low), Or).
  361expand_fuzzy(Token, Result) :-
  362    atomic(Token),
  363    !,
  364    (   rdf_stopgap_token(Token)
  365    ->  Result = @(stopgap)
  366    ;   Result = Token
  367    ).
  368expand_fuzzy(Token, _) :-
  369    throw(error(type_error(Token, boolean_expression), _)).
  370
  371simplify(Expr0, Expr) :-
  372    simple(Expr0, Expr),
  373    !.
  374simplify(Expr, Expr).
  375
  376simple(and(@(false), _), @(false)).
  377simple(and(_, @(false)), @(false)).
  378simple(and(@(stopgap), Token), Token).
  379simple(and(Token, @(stopgap)), Token).
  380simple(or(@(false), X), X).
  381simple(or(X, @(false)), X).
  382simple(or(@(stopgap), Token), Token).
  383simple(or(Token, @(stopgap)), Token).
  384
  385
  386list_to_or([], _, @(false)) :- !.
  387list_to_or([X], How, One) :-
  388    !,
  389    tag(How, X, One).
  390list_to_or([H0|T0], How, or(H, T)) :-
  391    tag(How, H0, H),
  392    list_to_or(T0, How, T).
  393
  394tag(sounds(X),    Y, sounds(X,Y)).
  395tag(stem(X),      Y, stem(X,Y)).
  396tag(prefix(X),    Y, prefix(X,Y)).
  397tag(case(X),      Y, case(X,Y)).
  398tag(between(L,H), Y, between(L,H,Y)).
  399tag(ge(L),        Y, ge(L,Y)).
  400tag(le(H),        Y, le(H,Y)).
  401
  402untag(sounds(_,Y),    Y).
  403untag(stem(_,Y),      Y).
  404untag(prefix(_,Y),    Y).
  405untag(case(_,Y),      Y).
  406untag(between(_,_,Y), Y).
  407untag(le(_,Y),        Y).
  408untag(ge(_,Y),        Y).
  409
  410untag(sounds(X,Y),    sounds(X),    Y).
  411untag(stem(X,Y),      stem(X),      Y).
  412untag(prefix(X,Y),    prefix(X),    Y).
  413untag(case(X,Y),      case(X),      Y).
  414untag(between(L,H,Y), between(L,H), Y).
  415untag(ge(L,Y),        ge(L),        Y).
  416untag(le(H,Y),        le(H),        Y).

nnf(+Formula, -NNF): Rewrite to Negation Normal Form, meaning negations only appear around literals.

  424nnf(not(not(A0)), A) :-
  425    !,
  426    nnf(A0, A).
  427nnf(not(and(A0,B0)), or(A,B)) :-
  428    !,
  429    nnf(not(A0), A),
  430    nnf(not(B0), B).
  431nnf(not(or(A0,B0)), and(A,B)) :-
  432    !,
  433    nnf(not(A0), A),
  434    nnf(not(B0), B).
  435nnf(A, A).

dnf(+NNF, -DNF): Convert a formula in NNF to Disjunctive Normal Form (DNF)

  442dnf(or(A0,B0), or(A, B)) :-
  443    !,
  444    dnf(A0, A),
  445    dnf(B0, B).
  446dnf(and(A0,B0), DNF):-
  447    !,
  448    dnf(A0, A1),
  449    dnf(B0, B1),
  450    dnf1(and(A1,B1), DNF).
  451dnf(DNF, DNF).
  452
  453dnf1(and(A0, or(B,C)), or(P,Q)) :-
  454    !,
  455    dnf1(and(A0,B), P),
  456    dnf1(and(A0,C), Q).
  457dnf1(and(or(B,C), A0), or(P,Q)) :-
  458    !,
  459    dnf1(and(A0,B), P),
  460    dnf1(and(A0,C), Q).
  461dnf1(DNF, DNF).
  462
  463
  464                 /*******************************
  465                 *          TOKEN INDEX         *
  466                 *******************************/

token_index(-Map): Get the index of tokens. If not present, create one from the current database. Once created, the map is kept up-to-date using a monitor hook.

  474token_index(Map) :-
  475    literal_map(token, Map),
  476    !,
  477    wait_for_map(token).
  478token_index(Map) :-
  479    rdf_new_literal_map(Map),
  480    assert(literal_map(token, Map)),
  481    register_token_updater,
  482    message_queue_create(Queue),
  483    assert(map_building(token, Queue)),
  484    thread_create(make_literal_index(Queue), _,
  485                  [ alias('__rdf_tokenizer'),
  486                    detached(true)
  487                  ]),
  488    wait_for_map(token).
  489
  490register_token_updater :-
  491    Monitor = [ reset,
  492                new_literal,
  493                old_literal
  494              ],
  495    (   setting(index(default))
  496    ->  create_update_literal_thread(1),
  497        rdf_monitor(thread_monitor_literal, Monitor)
  498    ;   setting(index(thread(N)))
  499    ->  create_update_literal_thread(N),
  500        rdf_monitor(thread_monitor_literal, Monitor)
  501    ;   rdf_monitor(monitor_literal, Monitor)
  502    ).
  503
  504make_literal_index(Queue) :-
  505    call_cleanup(
  506        make_literal_index,
  507        ( message_queue_destroy(Queue),
  508          retractall(map_building(token, _)))).

make_literal_index: Create the initial literal index.

  514make_literal_index :-
  515    setting(index_threads(N)),
  516    !,
  517    threaded_literal_index(N),
  518    verbose('~N', []).
  519make_literal_index :-
  520    current_prolog_flag(cpu_count, X),
  521    threaded_literal_index(X),
  522    verbose('~N', []).
  523
  524threaded_literal_index(N) :-
  525    N > 1,
  526    !,
  527    message_queue_create(Q, [max_size(1000)]),
  528    create_index_threads(N, Q, Ids),
  529    forall(rdf_current_literal(Literal),
  530           thread_send_message(Q, Literal)),
  531    forall(between(1, N, _),
  532           thread_send_message(Q, done(true))),
  533    maplist(thread_join, Ids, _).
  534threaded_literal_index(_) :-
  535    forall(rdf_current_literal(Literal),
  536           register_literal(Literal)).
  537
  538create_index_threads(N, Q, [Id|T]) :-
  539    N > 0,
  540    !,
  541    thread_create(index_worker(Q), Id, []),
  542    N2 is N - 1,
  543    create_index_threads(N2, Q, T).
  544create_index_threads(_, _, []) :- !.
  545
  546index_worker(Queue) :-
  547    repeat,
  548        thread_get_message(Queue, Msg),
  549        work(Msg).
  550
  551work(done(true)) :- !.
  552work(Literal) :-
  553    register_literal(Literal),
  554    fail.

clean_token_index: Clean after a reset.

  561clean_token_index :-
  562    forall(literal_map(_, Map),
  563           rdf_reset_literal_map(Map)),
  564    retractall(stopgap(_)).

rdf_delete_literal_index(+Type): Fully delete a literal index

  570rdf_delete_literal_index(Type) :-
  571    must_be(atom, Type),
  572    (   retract(literal_map(Type, Map))
  573    ->  rdf_reset_literal_map(Map)          % destroy is unsafe
  574    ).
  575
  576                 /*******************************
  577                 *        THREADED UPDATE       *
  578                 *******************************/

create_update_literal_thread(+Threads): Set up literal monitoring using threads. While loading databases through rdf_attach_db/2 from rdf_persistency.pl, most of the time is spent updating the literal token database. While loading the RDF triples, most of the time is spent updating the AVL tree holding the literals. Updating the token index hangs on updating the AVL trees holding the tokens. Both tasks, however, can run concurrently.

  590create_update_literal_thread(Threads) :-
  591    message_queue_create(_,
  592                         [ alias(rdf_literal_monitor_queue),
  593                           max_size(50000)
  594                         ]),
  595    forall(between(1, Threads, _),
  596           create_index_worker(initial)).
  597
  598:- dynamic
  599    index_worker_id/1,
  600    extra_worker_count/1.  601
  602create_index_worker(Status) :-
  603    (   retract(index_worker_id(Id0))
  604    ->  true
  605    ;   Id0 = 1
  606    ),
  607    succ(Id0, Id1),
  608    assertz(index_worker_id(Id1)),
  609    atom_concat(rdf_literal_monitor_, Id0, Alias),
  610    inc_extra_worker_count(Status),
  611    thread_create(monitor_literals(Status), _,
  612                  [ alias(Alias)
  613                  ]).
  614
  615monitor_literals(initial) :-
  616    set_prolog_flag(agc_margin, 0), % we don't create garbage
  617    repeat,
  618        thread_get_message(rdf_literal_monitor_queue, Literal),
  619        register_literal(Literal),
  620    fail.
  621monitor_literals(extra) :-
  622    set_prolog_flag(agc_margin, 0),
  623    repeat,
  624        (   thread_get_message(rdf_literal_monitor_queue, Literal,
  625                               [ timeout(1)
  626                               ])
  627        ->  register_literal(Literal),
  628            fail
  629        ;   !
  630        ),
  631    with_mutex(create_index_worker, dec_extra_worker_count),
  632    thread_self(Me),
  633    thread_detach(Me).
  634
  635thread_monitor_literal(new_literal(Literal)) :-
  636    !,
  637    thread_send_message(rdf_literal_monitor_queue, Literal).
  638thread_monitor_literal(Action) :-
  639    !,
  640    monitor_literal(Action).

check_index_workers(+Alias, +Keys): Increase the number of workers indexing literals sent to the queue with alias Alias if the queue gets overfull.

  647check_index_workers(Alias, Keys) :-
  648    max_extra_workers(Max),
  649    Max > 0,
  650    message_queue_property(Queue, alias(Alias)),
  651    message_queue_property(Queue, size(Size)),
  652    Size > 10000,
  653    \+ ( extra_worker_count(Extra),
  654         Extra >= Max
  655       ),
  656    !,
  657    debug(rdf_litindex,
  658          'Creating extra literal indexer (Queue=~D, Keys=~D)',
  659          [Size, Keys]),
  660    with_mutex(create_index_worker, create_index_worker(extra)).
  661check_index_workers(_, _).
  662
  663inc_extra_worker_count(extra) :-
  664    !,
  665    (   retract(extra_worker_count(C0))
  666    ->  C is C0+1
  667    ;   C = 1
  668    ),
  669    asserta(extra_worker_count(C)).
  670inc_extra_worker_count(_).
  671
  672dec_extra_worker_count :-
  673    retract(extra_worker_count(C0)),
  674    !,
  675    C is C0-1,
  676    asserta(extra_worker_count(C)).
  677dec_extra_worker_count.
  678
  679max_extra_workers(Max) :-
  680    current_prolog_flag(cpu_count, Count),
  681    Max is Count//2.
  682
  683
  684                 /*******************************
  685                 *       MONITORED UPDATE       *
  686                 *******************************/
  687
  688monitor_literal(new_literal(Literal)) :-
  689    register_literal(Literal).
  690monitor_literal(old_literal(Literal)) :-
  691    unregister_literal(Literal).
  692monitor_literal(transaction(begin, reset)) :-
  693    rdf_monitor(monitor_literal, [-old_literal]),
  694    clean_token_index.
  695monitor_literal(transaction(end, reset)) :-
  696    rdf_monitor(monitor_literal, [+old_literal]).

register_literal(+Literal): Associate the tokens of a literal with the literal itself.

  702register_literal(Literal) :-
  703    (   rdf_tokenize_literal(Literal, Tokens0)
  704    ->  sort(Tokens0, Tokens),
  705        text_of(Literal, Lang, Text),
  706        literal_map(token, Map),
  707        add_tokens(Tokens, Lang, Text, Map)
  708    ;   true
  709    ).
  710
  711add_tokens([], _, _, _).
  712add_tokens([H|T], Lang, Literal, Map) :-
  713    rdf_insert_literal_map(Map, H, Literal, Keys),
  714    (   var(Keys)
  715    ->  (   rdf_keys_in_literal_map(Map, key(H), Count),
  716            setting(stopgap_threshold(Threshold)),
  717            Count > Threshold
  718        ->  assert(stopgap(H)),
  719            rdf_delete_literal_map(Map, H)
  720        ;   true
  721        )
  722    ;   forall(new_token(H, Lang), true),
  723        (   Keys mod 1000 =:= 0
  724        ->  progress(Map, 'Tokens'),
  725            (   Keys mod 10000 =:= 0
  726            ->  check_index_workers(rdf_literal_monitor_queue, Keys)
  727            ;   true
  728            )
  729        ;   true
  730        )
  731    ),
  732    add_tokens(T, Lang, Literal, Map).

unregister_literal(+Literal): Literal is removed from the database. As we abstract from lang and type qualifiers we first have to check this is the last one that is destroyed.

  741unregister_literal(Literal) :-
  742    text_of(Literal, _Lang, Text),
  743    (   rdf(_,_,literal(Text))
  744    ->  true                        % still something left
  745    ;   rdf_tokenize_literal(Literal, Tokens0),
  746        sort(Tokens0, Tokens),
  747        literal_map(token, Map),
  748        del_tokens(Tokens, Text, Map)
  749    ).
  750
  751del_tokens([], _, _).
  752del_tokens([H|T], Literal, Map) :-
  753    rdf_delete_literal_map(Map, H, Literal),
  754    del_tokens(T, Literal, Map).

rdf_tokenize_literal(+Literal, -Tokens) is semidet: Tokenize a literal. We make this hookable as tokenization is generally domain dependent.

  762rdf_tokenize_literal(Literal, Tokens) :-
  763    tokenization(Literal, Tokens),
  764    !.               % Hook
  765rdf_tokenize_literal(Literal, Tokens) :-
  766    text_of(Literal, _Lang, Text),
  767    atom(Text),
  768    tokenize_atom(Text, Tokens0),
  769    select_tokens(Tokens0, Tokens).
  770
  771select_tokens([], []).
  772select_tokens([H|T0], T) :-
  773    (   exclude_from_index(token, H)
  774    ->  select_tokens(T0, T)
  775    ;   number(H)
  776    ->  (   integer(H),
  777            between(-1073741824, 1073741823, H)
  778        ->  T = [H|T1],
  779            select_tokens(T0, T1)
  780        ;   select_tokens(T0, T)
  781        )
  782    ;   atom_length(H, 1)
  783    ->  select_tokens(T0, T)
  784    ;   default_stopgap(H)
  785    ->  select_tokens(T0, T)
  786    ;   stopgap(H)
  787    ->  select_tokens(T0, T)
  788    ;   T = [H|T1],
  789        select_tokens(T0, T1)
  790    ).

rdf_stopgap_token(-Token) is nondet

True when Token is a stopgap token. Currently, this implies one of:

exclude_from_index(token, Token) is true
default_stopgap(Token) is true
Token is an atom of length 1
Token was added to the dynamic stopgap token set because it appeared in more than stopgap_threshold literals.

  803rdf_stopgap_token(Token) :-
  804    (   var(Token)
  805    ->  rdf_stopgap_token2(Token)
  806    ;   rdf_stopgap_token2(Token), !
  807    ).
  808
  809rdf_stopgap_token2(Token) :-
  810    exclude_from_index(token, Token).
  811rdf_stopgap_token2(Token) :-
  812    default_stopgap(Token).
  813rdf_stopgap_token2(Token) :-
  814    atom(Token),
  815    atom_length(Token, 1).
  816rdf_stopgap_token2(Token) :-
  817    stopgap(Token).

default_stopgap(?Token): Tokens we do not wish to index, as they create huge amounts of data with little or no value. Is there a more general way to describe this? Experience shows that simple word count is not a good criterion, as it often rules out popular domain terms.

  826default_stopgap(and).
  827default_stopgap(an).
  828default_stopgap(or).
  829default_stopgap(of).
  830default_stopgap(on).
  831default_stopgap(in).
  832default_stopgap(this).
  833default_stopgap(the).

text_of(+LiteralArg, -Lang, -Text) is semidet: Get the textual or (integer) numerical information from a literal value. Lang is the language to use for stemming. Currently we use English for untyped plain literals or literals typed xsd:string. Formally, these should not be tokenized, but a lot of data out there does not tag strings with their language.

  844text_of(type(xsd:string, Text), en, Text) :- !.
  845text_of(type(_, Text), -, Text) :- !.
  846text_of(lang(Lang, Text), Lang, Text) :- !.
  847text_of(Text, en, Text) :- atom(Text), !.
  848text_of(Text, -, Text) :- integer(Text).
  849
  850
  851                 /*******************************
  852                 *         STEM INDEX           *
  853                 *******************************/

stem_index(-Map) is det: Get the stemming literal index. This index is created on demand. If some thread is creating the index, other threads wait for its completion.

  861stem_index(Map) :-
  862    literal_map(stem, Map),
  863    !,
  864    wait_for_map(stem).
  865stem_index(Map) :-
  866    rdf_new_literal_map(Map),
  867    assert(literal_map(stem, Map)),
  868    assert((new_token(Token, Lang) :- add_stem(Token, Lang, Map))),
  869    message_queue_create(Queue),
  870    assert(map_building(stem, Queue)),
  871    thread_create(fill_stem_index(Map, Queue), _,
  872                  [ alias('__rdf_stemmer'),
  873                    detached(true)
  874                  ]),
  875    wait_for_map(stem).
  876
  877wait_for_map(MapName) :-
  878    (   map_building(MapName, Queue)
  879    ->  catch(thread_get_message(Queue, _), _, true),
  880        wait_for_map(MapName)
  881    ;   true
  882    ).
  883
  884fill_stem_index(StemMap, Queue) :-
  885    call_cleanup(
  886        forall(rdf_current_literal(Literal),
  887               stem_literal_tokens(Literal, StemMap)),
  888        ( message_queue_destroy(Queue),
  889          retractall(map_building(stem, _)))).
  890
  891stem_literal_tokens(Literal, StemMap) :-
  892    rdf_tokenize_literal(Literal, Tokens),
  893    !,
  894    sort(Tokens, Tokens1),
  895    text_of(Literal, Lang, _Text),
  896    insert_tokens_stem(Tokens1, Lang, StemMap).
  897stem_literal_tokens(_,_).
  898
  899insert_tokens_stem([], _, _).
  900insert_tokens_stem([Token|T], Lang, Map) :-
  901    (   atom(Token)
  902    ->  (   stem(Token, Lang, Stem)
  903        ->  rdf_insert_literal_map(Map, Stem, Token, Keys),
  904            (   integer(Keys),
  905                Keys mod 1000 =:= 0
  906            ->  progress(Map, 'Stem')
  907            ;   true
  908            )
  909        ;   true
  910        )
  911    ;   true
  912    ),
  913    insert_tokens_stem(T, Lang, Map).
  914
  915
  916add_stem(Token, Lang, Map) :-
  917    stem(Lang, Token, Stem),
  918    rdf_insert_literal_map(Map, Stem, Token, _).
  919
  920stem(Token, LangSpec, Stem) :-
  921    main_lang(LangSpec, Lang),
  922    downcase_atom(Token, Lower),
  923    catch(snowball(Lang, Lower, Stem), _, fail).
  924
  925main_lang(LangSpec, Lang) :-
  926    sub_atom(LangSpec, Before, _, _, -),
  927    !,
  928    sub_atom(LangSpec, 0, Before, _, Lang).
  929main_lang(LangSpec, Lang) :-
  930    downcase_atom(LangSpec, Lang).
  931
  932
  933                 /*******************************
  934                 *        METAPHONE INDEX       *
  935                 *******************************/
  936
  937
  938metaphone_index(Map) :-
  939    literal_map(metaphone, Map),
  940    !,
  941    wait_for_map(metaphone).
  942metaphone_index(Map) :-
  943    rdf_new_literal_map(Map),
  944    assert(literal_map(metaphone, Map)),
  945    assert((new_token(Token, Lang) :- add_metaphone(Token, Lang, Map))),
  946    message_queue_create(Queue),
  947    assert(map_building(metaphone, Queue)),
  948    thread_create(fill_metaphone_index(Map, Queue), _,
  949                  [ alias('__rdf_metaphone_indexer'),
  950                    detached(true)
  951                  ]),
  952    wait_for_map(metaphone).
  953
  954fill_metaphone_index(MetaphoneMap, Queue) :-
  955    call_cleanup(
  956        fill_metaphone_index(MetaphoneMap),
  957        ( message_queue_destroy(Queue),
  958          retractall(map_building(metaphone, _)))).
  959
  960fill_metaphone_index(MetaphoneMap) :-
  961    token_index(TokenMap),
  962    rdf_keys_in_literal_map(TokenMap, all, Tokens),
  963    metaphone(Tokens, MetaphoneMap).
  964
  965metaphone([], _).
  966metaphone([Token|T], Map) :-
  967    (   atom(Token),
  968        double_metaphone(Token, SoundEx)
  969    ->  rdf_insert_literal_map(Map, SoundEx, Token, Keys),
  970        (   integer(Keys),
  971            Keys mod 1000 =:= 0
  972        ->  progress(Map, 'Metaphone')
  973        ;   true
  974        )
  975    ;   true
  976    ),
  977    metaphone(T, Map).
  978
  979
  980add_metaphone(Token, _Lang, Map) :-
  981    atom(Token),
  982    !,
  983    double_metaphone(Token, SoundEx),
  984    rdf_insert_literal_map(Map, SoundEx, Token).
  985add_metaphone(_, _, _).

rdf_literal_index(+Type, -Index) is det

True when Index is a literal map containing the index of Type. Type is one of:

token: Tokens are basically words of literal values. See rdf_tokenize_literal/2. The token map maps tokens to full literal texts.
stem: Index of stemmed tokens. If the language is available, the tokens are stemmed using the matching snowball stemmer. The stem map maps stemmed to full tokens.
metaphone: Phonetic index of tokens. The metaphone map maps phonetic keys to tokens.

 1004rdf_literal_index(token, Map) :-
 1005    !,
 1006    token_index(Map).
 1007rdf_literal_index(stem, Map) :-
 1008    !,
 1009    stem_index(Map).
 1010rdf_literal_index(metaphone, Map) :-
 1011    !,
 1012    metaphone_index(Map).
 1013rdf_literal_index(Type, _Map) :-
 1014    domain_error(literal_index, Type).
 1015
 1016
 1017                 /*******************************
 1018                 *             UTIL             *
 1019                 *******************************/
 1020
 1021verbose(Fmt, Args) :-
 1022    setting(verbose(true)),
 1023    !,
 1024    format(user_error, Fmt, Args).
 1025verbose(_, _).
 1026
 1027progress(Map, Which) :-
 1028    setting(verbose(true)),
 1029    !,
 1030    rdf_statistics_literal_map(Map, size(Keys, Values)),
 1031    format(user_error,
 1032           '\r~t~w: ~12|Keys: ~t~D~15+; Values: ~t~D~20+',
 1033           [Which, Keys, Values]).
 1034progress(_,_)