1/*  Part of SWI-Prolog
    2
    3    Author:        Willem Robert van Hage
    4    E-mail:        W.R.van.Hage@vu.nl
    5    WWW:           http://www.few.vu.nl/~wrvhage
    6    Copyright (c)  2009-2012, Vrije Universiteit Amsterdam
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(space_web_loader,
   36          [ space_load_url/1,      % +URL
   37            space_load_url/2,      % +URL, +Options
   38            space_unload_url/1,    % +URL
   39            space_unload_url/2,    % +URL, +Options
   40            space_crawl_url/1,     % +URL
   41            space_crawl_url/2,     % +URL, +Options
   42            space_uncrawl_url/1,   % +URL
   43            space_uncrawl_url/2    % +URL, +Options
   44          ]).   45
   46:- use_module(library(space/space)).   47:- use_module(library(semweb/rdf_db)).   48:- use_module(library(semweb/rdf_turtle)).   49:- use_module(library(semweb/rdf_http_plugin)).
 space_load_url(+URL) is det
Retrieve RDF over HTTP from a URL, load it in the rdf_db and index all URI-Shape pairs that can be found in it into the default index.
   57space_load_url(URL) :- space_load_url(URL,[]).
 space_load_url(+URL, +Options) is det
Load using space_load_url/1, given extra options.
index(+IndexName)
Index the URI-Shape pairs into index named IndexName.
graph(+Graph)
Store the URI-Shape pairs in the named graph Graph. The pairs are recorded as uri_shape(URI,Shape,Graph).
   70space_load_url(URL, Options) :-
   71    space_setting(rtree_default_index(DefaultIndex)),
   72    option(index(IndexName), Options, DefaultIndex),
   73    (   option(graph(Graph), Options)
   74    ->  rdf_load(URL,Graph)
   75    ;   rdf_load(URL)
   76    ),
   77    Counter = counter(0),
   78    forall(uri_shape(URI, Shape, URL),
   79           (   space_assert(URI, Shape, IndexName),
   80               arg(1, Counter, N0),
   81               N is N0 + 1,
   82               nb_setarg(1, Counter, N)
   83           )),
   84    arg(1, Counter, C),
   85    print_message(informational,space_load_url(C,IndexName)).
 space_unload_url(+URL) is det
Unload the RDF that was fetched from URL and remove all URI-Shape pairs that are contained in it from the default index.
   92space_unload_url(URL) :- space_unload_url(URL,[]).
 space_unload_url(+URL, +Options) is det
Unload the RDF that was fetched from URL and remove all URI-Shape pairs that are contained in it. Accepts extra options:
index(+IndexName)
Remove from the index named IndexName.
graph(+Graph)
Remove the URI-Shape pairs from the named graph Graph.
  105space_unload_url(URL, Options) :-
  106    space_setting(rtree_default_index(DefaultIndex)),
  107    option(index(IndexName), Options, DefaultIndex),
  108    option(graph(Graph), Options, URL),
  109    Counter = counter(0),
  110    forall(uri_shape(URI, Shape, URL),
  111           (   space_retract(URI, Shape, IndexName),
  112               arg(1, Counter, N0),
  113               N is N0 + 1,
  114               nb_setarg(1, Counter, N)
  115           )),
  116    arg(1, Counter, C),
  117    print_message(informational,space_unload_url(C,IndexName)),
  118    rdf_unload(Graph).
  119
  120:- multifile prolog:message//1.  121
  122prolog:message(space_load_url(0,_)) --> [], !.
  123prolog:message(space_load_url(C,IndexName)) -->
  124    [ 'Added ~w URI-Shape ~w to ~w'-[C, P, IndexName] ],
  125    { plural(C,P) }.
  126
  127prolog:message(space_unload_url(0,_)) --> [], !.
  128prolog:message(space_unload_url(C,IndexName)) -->
  129    [ 'Removed ~w URI-Shape ~w from ~w'-[C, P, IndexName] ],
  130    { plural(C,P) }.
  131
  132plural(1,pair) :- !.
  133plural(_,pairs).
  134
  135prolog:message(space_crawl_url(C)) -->
  136    [ 'Crawling ~w'-[C] ].
  137
  138prolog:message(space_uncrawl_url(C)) -->
  139    [ 'Uncrawling ~w'-[C] ].
 link_property(+Property) is det
RDF properties declared a link_property will be traversed by space_crawl_url. link_property is a dynamic property. By default owl:sameAs, skos:exactMatch, and skos:closeMatch are link properties.
  149:- dynamic link_property/1.  150link_property('http://www.w3.org/2002/07/owl#sameAs').
  151link_property('http://www.w3.org/2004/02/skos/core#exactMatch').
  152link_property('http://www.w3.org/2004/02/skos/core#closeMatch').
 space_crawl_url(+URL) is det
Retrieve RDF over HTTP from a URL, load it in the rdf_db and index all URI-Shape pairs that can be found in it into the default index. Also attempt to resolve all URIs that appear as object in a link_property statement downloaded from the URL. Retrieve these URIs and process them in the same way. Iterate this process until there are no new links that have not already been crawled.
  165space_crawl_url(URL) :- space_crawl_url(URL,[]).
 space_crawl_url(+URL, +Options) is det
Crawl using space_crawl_url/1, with additional options.
index(+IndexName)
Index the URI-Shape pairs into index named IndexName.
graph(+Graph)
Store the URI-Shape pairs in the named graph Graph. The pairs are recorded as uri_shape(URI,Shape,Graph).
  178space_crawl_url(URL,Options) :-
  179    with_mutex(message,print_message(informational,space_crawl_url(URL))),
  180    space_load_url(URL,Options),
  181    findall( NewLink, new_link(URL:_,NewLink,_Type), NewLinks ),
  182    forall( member(NL, NewLinks),
  183            thread_create(space_crawl_url(NL,Options),_,[])
  184          ).
 space_uncrawl_url(+URL) is det
Unload the RDF that was fetched from URL and remove all URI-Shape pairs that are contained in it from the default index. Also unload all data that were crawled by iteratively resolving the URIs linked to with a link_property.
  193space_uncrawl_url(URL) :- space_uncrawl_url(URL,[]).
 space_uncrawl_url(+URL, +IndexName) is det
Unload using space_uncrawl_url/1, but remove the URI-Shape pairs from the index named IndexName.
index(+IndexName)
Remove the URI-Shape pairs from index named IndexName.
graph(+Graph)
Remove the URI-Shape pairs from the named graph Graph.
  206space_uncrawl_url(URL,Options) :-
  207    with_mutex(message,print_message(informational,space_uncrawl_url(URL))),
  208    findall( Link, old_link(URL:_,Link,_Type), Links ),
  209    space_unload_url(URL,Options),
  210    forall( member(L, Links),
  211            space_uncrawl_url(L,Options)
  212          ).
  213
  214new_link(FromSource,NewLink,P) :-
  215    link_property(P),
  216    rdf(_,P,NewLink,FromSource),
  217    \+once(rdf(_,_,_,NewLink:_)).
  218
  219old_link(FromSource,Link,P) :-
  220    link_property(P),
  221    rdf(_,P,Link,FromSource),
  222    once(rdf(_,_,_,Link:_))