/*  Part of SWI-Prolog

    Author:        Jan Wielemaker
    E-mail:        J.Wielemaker@vu.nl
    WWW:           http://www.swi-prolog.org
    Copyright (c)  2009-2025, VU University Amsterdam
                              SWI-Prolog Solutions b.v.
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.

    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in
       the documentation and/or other materials provided with the
       distribution.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
*/

:- module(uri,
          [ uri_components/2,           % ?URI, ?Components
            uri_data/3,                 % ?Field, +Components, ?Data
            uri_data/4,                 % +Field, +Components, +Data, -New
            uri_edit/3,                 % +Actions,+URI0,-URI

            uri_normalized/2,           % +URI, -NormalizedURI
            iri_normalized/2,           % +IRI, -NormalizedIRI
            uri_normalized_iri/2,       % +URI, -NormalizedIRI
            uri_normalized/3,           % +URI, +Base, -NormalizedURI
            iri_normalized/3,           % +IRI, +Base, -NormalizedIRI
            uri_normalized_iri/3,       % +URI, +Base, -NormalizedIRI
            uri_resolve/3,              % +URI, +Base, -AbsURI
            uri_is_global/1,            % +URI
            uri_query_components/2,     % ?QueryString, ?NameValueList
            uri_authority_components/2, % ?Authority, ?Components
            uri_authority_data/3,       % ?Field, ?Components, ?Data
                                        % Encoding
            uri_encoded/3,              % +Component, ?Value, ?Encoded
            uri_file_name/2,            % ?URI, ?Path
            uri_iri/2                   % ?URI, ?IRI
          ]).
:- autoload(library(error), [domain_error/2, instantiation_error/1]).
:- if(exists_source(library(socket))).
:- autoload(library(socket), [gethostname/1]).
:- endif.

:- use_foreign_library(foreign(uri)).

/** <module> Process URIs

This library provides high-performance C-based primitives for
manipulating URIs. We decided for a C-based implementation for the much
better performance on raw character manipulation. Notably, URI handling
primitives are used in time-critical parts of RDF processing. This
implementation is based on RFC-3986:

    http://labs.apache.org/webarch/uri/rfc/rfc3986.html

The URI processing in this library is rather liberal. That is, we break
URIs according to the rules, but we do not validate that the components
are valid. Also, percent-decoding for IRIs is liberal. It first tries
UTF-8; then ISO-Latin-1 and finally accepts %-characters verbatim.

Earlier experience has shown that strict enforcement of the URI syntax
results in many errors that are accepted by many other web-document
processing tools.

This library provides explicit support for URN URIs.
*/

%!  uri_components(+URI, -Components) is det.
%!  uri_components(-URI, +Components) is det.
%
%   Break a URI into its 5 basic components according to the
%   RFC-3986 regular expression:
%
%   ```
%   ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
%    12            3  4          5       6  7        8 9
%   ```
%
%   If the scheme is `urn`, it is broken into its scheme, NID
%   (_Namespace Identifier_) and NSS (_Namespace Specific String_).
%
%   @arg Components is one of
%     - uri_components(Scheme, Authority, Path, Search, Fragment)
%       If a URI is _parsed_, i.e., using mode (+,-), components that
%       are not found are left _uninstantiated_ (variable). See
%       uri_data/3 for accessing this structure.
%     - urn_components(Scheme, NID, NSS, Search, Fragment)
%       Here Scheme is always `urn`. Otherwise the same comments
%       as for uri_components/5 apply.

%!  uri_data(+Field, +Components, -Data) is semidet.
%!  uri_data(-Field, +Components, -Data) is nondet.
%
%   Provide access to the `uri_components` or `urn_components`
%   structure. The Field `scheme` is always present. Other fields
%   depend on the scheme. The `urn` scheme provides `nid` and `nss`.
%   Other schemes provide `authority` and `path`. Both structures
%   provide `search` and `fragment`.
%
%   When Field is given, the predicate commits to the first matching
%   field. If Components is unbound this means a uri_components/5
%   term is created, as its clauses are tried first.

uri_data(Field, Components, Data), var(Field) =>
    uri_data_(Field, Components, Data).
uri_data(Field, Components, Data), atom(Field) =>
    uri_data_(Field, Components, Data),
    !.

%   uri_data_(?Field, ?Components, ?Data)
%
%   Field/argument table shared by both modes of uri_data/3.

uri_data_(scheme,    uri_components(S, _, _, _, _), S).
uri_data_(authority, uri_components(_, A, _, _, _), A).
uri_data_(path,      uri_components(_, _, P, _, _), P).
uri_data_(search,    uri_components(_, _, _, S, _), S).
uri_data_(fragment,  uri_components(_, _, _, _, F), F).
uri_data_(scheme,    urn_components(S, _, _, _, _), S).
uri_data_(nid,       urn_components(_, I, _, _, _), I).
uri_data_(nss,       urn_components(_, _, N, _, _), N).
uri_data_(search,    urn_components(_, _, _, S, _), S).
uri_data_(fragment,  urn_components(_, _, _, _, F), F).

%!  uri_data(+Field, +Components, +Data, -NewComponents) is det.
%
%   NewComponents is the same as Components with Field set to Data.
%   Components is either a uri_components/5 term (fields `scheme`,
%   `authority`, `path`, `search`, `fragment`) or a urn_components/5
%   term (fields `scheme`, `nid`, `nss`, `search`, `fragment`).
%
%   @error domain_error(uri_field, Field) if Field is invalid, i.e.,
%          it is not a field of the given Components term.
%   @error instantiation_error if Field or Components is unbound.

% The instantiation checks come first: the rules below use single
% sided unification, so an unbound Field or Components would match
% none of them and end up in the final domain_error rule.
uri_data(Field, _, _, _), var(Field) =>
    instantiation_error(Field).
uri_data(_, Components, _, _), var(Components) =>
    instantiation_error(Components).
uri_data(scheme,    uri_components(_, A, P, Q, F), S, New) =>
    New = uri_components(S, A, P, Q, F).
uri_data(scheme,    urn_components(_, I, N, Q, F), S, New) =>
    New = urn_components(S, I, N, Q, F).
uri_data(authority, uri_components(S, _, P, Q, F), A, New) =>
    New = uri_components(S, A, P, Q, F).
uri_data(path,      uri_components(S, A, _, Q, F), P, New) =>
    New = uri_components(S, A, P, Q, F).
uri_data(search,    uri_components(S, A, P, _, F), Q, New) =>
    New = uri_components(S, A, P, Q, F).
uri_data(search,    urn_components(S, I, N, _, F), Q, New) =>
    New = urn_components(S, I, N, Q, F).
uri_data(fragment,  uri_components(S, A, P, Q, _), F, New) =>
    New = uri_components(S, A, P, Q, F).
uri_data(fragment,  urn_components(S, I, N, Q, _), F, New) =>
    New = urn_components(S, I, N, Q, F).
uri_data(nid,       urn_components(S, _, N, Q, F), I, New) =>
    New = urn_components(S, I, N, Q, F).
uri_data(nss,       urn_components(S, I, _, Q, F), N, New) =>
    New = urn_components(S, I, N, Q, F).
uri_data(Field, _, _, _) =>
    domain_error(uri_field, Field).

%!  uri_normalized(+URI, -NormalizedURI:atom) is det.
%
%   NormalizedURI is the normalized form of URI. Normalization is
%   syntactic and involves the following steps:
%
%     * 6.2.2.1. Case Normalization
%     * 6.2.2.2. Percent-Encoding Normalization
%     * 6.2.2.3. Path Segment Normalization

%!  iri_normalized(+IRI, -NormalizedIRI) is det.
%
%   NormalizedIRI is the normalized form of IRI.
%   Normalization is syntactic and involves the following steps:
%
%     * 6.2.2.1. Case Normalization
%     * 6.2.2.3. Path Segment Normalization
%
%   @see    This is similar to uri_normalized/2, but does not do
%           normalization of %-escapes.

%!  uri_normalized_iri(+URI, -NormalizedIRI) is det.
%
%   As uri_normalized/2, but percent-encoding is translated into IRI
%   Unicode characters. The translation is liberal: valid UTF-8
%   sequences of %-encoded bytes are mapped to the Unicode
%   character. Other %XX-sequences are mapped to the corresponding
%   ISO-Latin-1 character and sole % characters are left untouched.
%
%   @see uri_iri/2.


%!  uri_is_global(+URI) is semidet.
%
%   True if URI has a scheme. The semantics is the same as the code
%   below, but the implementation is more efficient as it does not need
%   to parse the other components, nor needs to bind the scheme. The
%   condition to demand a scheme of more than one character is added to
%   avoid confusion with DOS path names.
%
%   ```
%   uri_is_global(URI) :-
%       uri_components(URI, Components),
%       uri_data(scheme, Components, Scheme),
%       nonvar(Scheme),
%       atom_length(Scheme, Len),
%       Len > 1.
%   ```

%!  uri_resolve(+URI, +Base, -GlobalURI:atom) is det.
%
%   Resolve a possibly local URI relative to Base. This implements
%   http://labs.apache.org/webarch/uri/rfc/rfc3986.html#relative-transform

%!  uri_normalized(+URI, +Base, -NormalizedGlobalURI:atom) is det.
%
%   NormalizedGlobalURI is the normalized global version of URI.
%   Behaves as if defined by:
%
%   ```
%   uri_normalized(URI, Base, NormalizedGlobalURI) :-
%       uri_resolve(URI, Base, GlobalURI),
%       uri_normalized(GlobalURI, NormalizedGlobalURI).
%   ```

%!  iri_normalized(+IRI, +Base, -NormalizedGlobalIRI:atom) is det.
%
%   NormalizedGlobalIRI is the normalized global version of IRI.
%   This is similar to uri_normalized/3, but does not do %-escape
%   normalization.

%!  uri_normalized_iri(+URI, +Base, -NormalizedGlobalIRI:atom) is det.
%
%   NormalizedGlobalIRI is the normalized global IRI of URI. Behaves
%   as if defined by:
%
%   ```
%   uri_normalized_iri(URI, Base, NormalizedGlobalIRI) :-
%       uri_resolve(URI, Base, GlobalURI),
%       uri_normalized_iri(GlobalURI, NormalizedGlobalIRI).
%   ```

%!  uri_query_components(+String, -Query) is det.
%!  uri_query_components(-String:atom, +Query) is det.
%
%   Perform encoding and decoding of a URI query string. Query is a
%   list of fully decoded (Unicode) Name=Value pairs. In mode (-,+),
%   query elements of the forms Name(Value) and Name-Value are also
%   accepted to enhance interoperability with the option and pairs
%   libraries. E.g.
%
%   ```
%   ?- uri_query_components(QS, [a=b, c('d+w'), n-'VU Amsterdam']).
%   QS = 'a=b&c=d%2Bw&n=VU%20Amsterdam'.
%
%   ?- uri_query_components('a=b&c=d%2Bw&n=VU%20Amsterdam', Q).
%   Q = [a=b, c='d+w', n='VU Amsterdam'].
%   ```


%!  uri_authority_components(+Authority, -Components) is det.
%!  uri_authority_components(-Authority:atom, +Components) is det.
%
%   Break-down the authority component of a URI. The fields of the
%   structure Components can be accessed using uri_authority_data/3.
%   This predicate deals with IPv6 addresses written as ``[ip]``,
%   returning the _ip_ as `host`, without the enclosing `[]`. When
%   constructing an authority string and the host contains `:`, the
%   host is embraced in `[]`. If `[]` is not used correctly, the
%   behavior should be considered poorly defined. If there is no
%   balancing `]` or the host part does not end with `]`, these
%   characters are considered normal characters and part of the
%   (invalid) host name.


%!  uri_authority_data(+Field, ?Components, ?Data) is semidet.
%
%   Provide access to the uri_authority structure, which is a term
%   uri_authority(User, Password, Host, Port). Defined field-names
%   are: `user`, `password`, `host` and `port`.

uri_authority_data(user,     uri_authority(U, _, _, _), U).
uri_authority_data(password, uri_authority(_, P, _, _), P).
uri_authority_data(host,     uri_authority(_, _, H, _), H).
uri_authority_data(port,     uri_authority(_, _, _, P), P).


%!  uri_encoded(+Component, +Value, -Encoded:atom) is det.
%!  uri_encoded(+Component, -Value:atom, +Encoded) is det.
%
%   Encoded is the URI encoding for Value. When encoding
%   (Value->Encoded), Component specifies the URI component where the
%   value is used. It is one of `query_value`, `fragment`, `path` or
%   `segment`. Besides alphanumerical characters, the following
%   characters are passed verbatim (the set is split in logical groups
%   according to RFC3986).
%
%       $ query_value, fragment :
%       "-._~" | "!$'()*,;" | "@" | "/?"
%       $ path :
%       "-._~" | "!$&'()*,;=" | "@" | "/"
%       $ segment :
%       "-._~" | "!$&'()*,;=" | "@"

%!  uri_iri(+URI, -IRI:atom) is det.
%!  uri_iri(-URI:atom, +IRI) is det.
%
%   Convert between a URI, encoded in US-ASCII and an IRI. An IRI is
%   a fully expanded Unicode string. Unicode strings are first
%   encoded into UTF-8, after which %-encoding takes place.
%
%   @error syntax_error(Culprit) in mode (+,-) if URI is not a
%   legally percent-encoded UTF-8 string.


%!  uri_file_name(+URI, -FileName:atom) is semidet.
%!  uri_file_name(-URI:atom, +FileName) is det.
%
%   Convert between a URI and a local file_name. This protocol is
%   covered by RFC 1738. Please note that file-URIs use _absolute_
%   paths. The mode (-, +) translates a possible relative path into
%   an absolute one.
% Dispatch on the instantiated argument: a bound URI is decoded to a
% file name; otherwise a bound FileName is turned into a file URI. The
% predicate fails if both arguments are unbound.

uri_file_name(URI, FileName) :-
    (   nonvar(URI)
    ->  file_uri_to_name(URI, FileName)
    ;   nonvar(FileName)
    ->  file_name_to_uri(FileName, URI)
    ).

%   file_uri_to_name(+URI, -FileName) is semidet.
%
%   True when URI has scheme `file`, its authority is accepted by
%   my_host/1 and FileName is its decoded path.

file_uri_to_name(URI, FileName) :-
    uri_components(URI, Parts),
    uri_data(scheme, Parts, Scheme),
    Scheme == file,
    uri_data(authority, Parts, Authority),
    my_host(Authority),
    uri_data(path, Parts, EncodedPath),
    uri_encoded(path, DecodedPath, EncodedPath),
    delete_leading_slash(DecodedPath, FileName).

%   file_name_to_uri(+FileName, -URI) is det.
%
%   URI is the `file` URI with an empty authority for the absolute
%   version of FileName.

file_name_to_uri(FileName, URI) :-
    absolute_file_name(FileName, AbsPath),
    ensure_leading_slash(AbsPath, SlashPath),
    uri_encoded(path, SlashPath, EncodedPath),
    uri_data(scheme, Parts, file),
    uri_data(authority, Parts, ''),
    uri_data(path, Parts, EncodedPath),
    uri_components(URI, Parts).

%   my_host(?Host) is semidet.
%
%   True when Host is the empty authority, `localhost` or, if
%   library(socket) is available, unifies with the result of
%   gethostname/1.

my_host(Host) :-
    (   Host = ''
    ->  true
    ;   Host = localhost
    ->  true
    ;   local_host_name(Host)
    ).

:- if(exists_source(library(socket))).
local_host_name(Host) :-
    gethostname(Host).
:- else.
local_host_name(_) :-
    fail.
:- endif.

%!  ensure_leading_slash(+WinPath, -Path).
%!  delete_leading_slash(+Path, -WinPath).
%
%   Deal with the fact that absolute paths in Windows start with a
%   drive letter rather than a  /.  For URIs we need a path that
%   starts with a /.

ensure_leading_slash(Path, SlashPath) :-
    sub_atom(Path, 0, _, _, /),
    !,
    SlashPath = Path.
ensure_leading_slash(Path, SlashPath) :-
    atom_concat(/, Path, SlashPath).

:- if(current_prolog_flag(windows, true)).
delete_leading_slash(Path, WinPath) :-
    (   atom_concat(/, WinPath, Path),
        is_absolute_file_name(WinPath)
    ->  true
    ;   WinPath = Path
    ).
:- else.
delete_leading_slash(Path, Path).
:- endif.


                 /*******************************
                 *          MODIFYING           *
                 *******************************/

%!  uri_edit(+Actions, +URI0, -URI) is det.
%
%   Modify a URI according to Actions. Actions is either a single
%   action or a (nested) list of actions. Defined primitive actions
%   are:
%
%     - scheme(+Scheme)
%       Set the Scheme of the URI (typically `http`, `https`, etc.)
%     - user(+User)
%       Add/set the user of the authority component.
%     - password(+Password)
%       Add/set the password of the authority component.
%     - host(+Host)
%       Add/set the host (or ip address) of the authority component.
%     - port(+Port)
%       Add/set the port of the authority component.
%     - path(+Path)
%       Set/extend the `path` component.  If Path is not absolute it
%       is taken relative to the path of URI0.
%     - search(+KeyValues)
%       Extend the `Key=Value` pairs of the current search (query)
%       component.   New values replace existing values.  If KeyValues
%       is written as =(KeyValues) the current search component is
%       ignored.  KeyValues is a list, whose elements are one of
%       `Key=Value`, `Key-Value` or `Key(Value)`.
%     - fragment(+Fragment)
%       Set the Fragment of the uri.
%     - nid(+NID)
%       Set the _Namespace Identifier_ for a URN URI.
%     - nss(+NSS)
%       Set the _Namespace Specific String_ for a URN URI.
%
%   Components can be _removed_ by using a variable as value, except
%   from `path` which can be reset using path(/) and query which can
%   be dropped using query(=([])).
%
%   @arg URI0 is either a valid uri or a variable to start fresh.
%   @error domain_error(uri_edit, Action) if Action is not one of the
%          actions above, or if `nid` or `nss` is applied to a URI
%          that is not a URN.

uri_edit(Actions, URI0, URI) :-
    (   var(URI0)
    ->  URI1 = '/'
    ;   URI1 = URI0
    ),
    uri_components(URI1, Comp0),
    edit_components(Actions, Comp0, Comp),
    uri_components(URI, Comp).

%   edit_components(+Actions, +Components0, -Components)
%
%   Apply a single action or a (nested) list of actions to the
%   uri_components/5 or urn_components/5 term Components0.

edit_components([], Comp0, Comp) =>
    Comp = Comp0.
edit_components([H|T], Comp0, Comp) =>
    edit_components(H, Comp0, Comp1),
    edit_components(T, Comp1, Comp).
edit_components(scheme(Scheme), Comp0, Comp) =>
    uri_data(scheme, Comp0, Scheme, Comp).
edit_components(path(Path), Comp0, Comp) =>
    uri_data(path, Comp0, Path0),
    (   (   var(Path0)
        ;   Path0 == ''
        )
    ->  Path1 = '/'                     % no path yet: resolve against root
    ;   Path1 = Path0
    ),
    uri_normalized(Path, Path1, Path2),
    uri_data(path, Comp0, Path2, Comp).
edit_components(fragment(Fragment), Comp0, Comp) =>
    uri_data(fragment, Comp0, Fragment, Comp).
edit_components(Authority, Comp0, Comp),
    authority_field(Authority) =>
    uri_data(authority, Comp0, Auth0),
    (   var(Auth0)
    ->  true                            % AComp0 stays unbound: all fields fresh
    ;   uri_authority_components(Auth0, AComp0)
    ),
    edit_auth_components(Authority, AComp0, AComp),
    uri_authority_components(Auth, AComp),
    uri_data(authority, Comp0, Auth, Comp).
edit_components(query(Search), Comp0, Comp) =>
    edit_components(search(Search), Comp0, Comp).
edit_components(search(=(Search)), Comp0, Comp) =>
    uri_query_components(String, Search),
    uri_data(search, Comp0, String, Comp).
edit_components(search(Search), Comp0, Comp) =>
    uri_data(search, Comp0, SS0),
    (   var(SS0)
    ->  Search0 = []
    ;   uri_query_components(SS0, Search0)
    ),
    join_search(Search0, Search, Search1),
    uri_query_components(SS1, Search1),
    uri_data(search, Comp0, SS1, Comp).
% `nid` and `nss` exist only in urn_components/5, so these two rules
% match that term directly. For any other components term they do not
% match and the action is rejected by the final rule.
edit_components(nid(NID), urn_components(S, _, NSS, Q, F), Comp) =>
    Comp = urn_components(S, NID, NSS, Q, F).
edit_components(nss(NSS), urn_components(S, NID, _, Q, F), Comp) =>
    Comp = urn_components(S, NID, NSS, Q, F).
edit_components(Other, _, _) =>
    domain_error(uri_edit, Other).

%   authority_field(+Action) is semidet.
%
%   True when Action edits a part of the authority component.

authority_field(user(_)).
authority_field(password(_)).
authority_field(host(_)).
authority_field(port(_)).

%   edit_auth_components(+Action, ?Authority0, -Authority)
%
%   Replace one argument of the uri_authority/4 term. Authority0 may
%   be unbound, in which case the other fields remain unbound.

edit_auth_components(user(User),
                     uri_authority(_, Passwd, Host, Port),
                     uri_authority(User, Passwd, Host, Port)).
edit_auth_components(password(Passwd),
                     uri_authority(User, _, Host, Port),
                     uri_authority(User, Passwd, Host, Port)).
edit_auth_components(host(Host),
                     uri_authority(User, Passwd, _, Port),
                     uri_authority(User, Passwd, Host, Port)).
edit_auth_components(port(Port),
                     uri_authority(User, Passwd, Host, _),
                     uri_authority(User, Passwd, Host, Port)).

%   join_search(+Search0, +New, -Search)
%
%   Search holds the Name=Value pairs of Search0 whose Name does not
%   appear in New as Name=_, Name(_) or Name-_, followed by New.

join_search([], Search, Search).
join_search([N=_|ST], New, Search) :-
    (   memberchk(N=_, New)
    ->  true
    ;   functor(T, N, 1),
        memberchk(T, New)
    ->  true
    ;   memberchk(N-_, New)
    ),
    !,
    join_search(ST, New, Search).
join_search([H|ST], New, [H|Search]) :-
    join_search(ST, New, Search).


                 /*******************************
                 *            SANDBOX           *
                 *******************************/

:- multifile sandbox:safe_primitive/1.

sandbox:safe_primitive(uri:uri_components(_,_)).
sandbox:safe_primitive(uri:uri_normalized(_,_)).
sandbox:safe_primitive(uri:iri_normalized(_,_)).
sandbox:safe_primitive(uri:uri_normalized_iri(_,_)).
sandbox:safe_primitive(uri:uri_normalized(_,_,_)).
sandbox:safe_primitive(uri:iri_normalized(_,_,_)).
sandbox:safe_primitive(uri:uri_normalized_iri(_,_,_)).
sandbox:safe_primitive(uri:uri_resolve(_,_,_)).
sandbox:safe_primitive(uri:uri_is_global(_)).
sandbox:safe_primitive(uri:uri_query_components(_,_)).
sandbox:safe_primitive(uri:uri_authority_components(_,_)).
sandbox:safe_primitive(uri:uri_encoded(_,_,_)).
sandbox:safe_primitive(uri:uri_iri(_,_)).