View source with raw comments or as raw

    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2022, VU University Amsterdam
    7                              CWI, Amsterdam,
    8                              SWI-Prolog Solutions b.v.
    9    All rights reserved.
   10
   11    Redistribution and use in source and binary forms, with or without
   12    modification, are permitted provided that the following conditions
   13    are met:
   14
   15    1. Redistributions of source code must retain the above copyright
   16       notice, this list of conditions and the following disclaimer.
   17
   18    2. Redistributions in binary form must reproduce the above copyright
   19       notice, this list of conditions and the following disclaimer in
   20       the documentation and/or other materials provided with the
   21       distribution.
   22
   23    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   24    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   25    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   26    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   27    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   28    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   29    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   30    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   31    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   33    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   34    POSSIBILITY OF SUCH DAMAGE.
   35*/
   36
   37:- module(csv,
   38          [ csv//1,                     % +Rows
   39            csv//2,                     % +Rows, +Options
   40
   41            csv_read_file/2,            % +File, -Data
   42            csv_read_file/3,            % +File, -Data, +Options
   43            csv_read_stream/3,          % +Stream, -Data, +Options
   44
   45            csv_read_file_row/3,        % +File, -Row, +Options
   46            csv_read_row/3,		% +Stream, -Row, +CompiledOptions
   47            csv_options/2,		% -Compiled, +Options
   48
   49            csv_write_file/2,           % +File, +Data
   50            csv_write_file/3,           % +File, +Data, +Options
   51            csv_write_stream/3          % +Stream, +Data, +Options
   52          ]).   53:- use_module(library(record),[(record)/1, op(_,_,record)]).   54
   55:- autoload(library(apply),[maplist/2]).   56:- use_module(library(debug),[debug/3]).   57:- autoload(library(error),[must_be/2,domain_error/2]).   58:- autoload(library(lists),[append/3]).   59:- autoload(library(option),[option/2,select_option/4]).   60:- autoload(library(pure_input),
   61	    [phrase_from_file/3,phrase_from_stream/2]).   62:- autoload(library(readutil),[read_line_to_codes/2]).   63:- autoload(library(dcg/basics),[string//1,eos//0]).

Process CSV (Comma-Separated Values) data

This library parses and generates CSV data. CSV data is represented in Prolog as a list of rows. Each row is a compound term, where all rows have the same name and arity.

See also: RFC 4180
To be done: (1) Implement immediate assert of the data to avoid possible stack overflows. (2) Writing creates an intermediate code-list, possibly overflowing resources. This waits for pure output!

   79:- predicate_options(csv//2, 2,
   80                     [ separator(code),         % must be code
   81                       strip(boolean),
   82                       ignore_quotes(boolean),
   83                       convert(boolean),
   84                       case(oneof([down,preserve,up])),
   85                       functor(atom),
   86                       arity(-nonneg),          % actually ?nonneg
   87                       match_arity(boolean)
   88                     ]).   89:- predicate_options(csv_read_file/3, 3,
   90                     [ pass_to(csv//2, 2),
   91                       pass_to(phrase_from_file/3, 3)
   92                     ]).   93:- predicate_options(csv_read_file_row/3, 3,
   94                     [ line(-integer),
   95                       pass_to(csv//2, 2),
   96                       pass_to(open/4, 4)
   97                     ]).   98:- predicate_options(csv_write_file/3, 3,
   99                     [ pass_to(csv//2, 2),
  100                       pass_to(open/4, 4)
  101                     ]).  102:- predicate_options(csv_write_stream/3, 3,
  103                     [ pass_to(csv//2, 2)
  104                     ]).  105
  106
  107:- record
  108    csv_options(separator:integer=0',,
  109                strip:boolean=false,
  110                ignore_quotes:boolean=false,
  111                convert:boolean=true,
  112                case:oneof([down,preserve,up])=preserve,
  113                functor:atom=row,
  114                arity:integer,
  115                match_arity:boolean=true,
  116                skip_header:atom).

csv_read_file(+File, -Rows) is det

csv_read_file(+File, -Rows, +Options) is det

Read a CSV file into a list of rows. Each row is a Prolog term with the same arity. Options are handed to csv//2. Remaining options are processed by phrase_from_file/3. The default separator depends on the file name extension and is \t for .tsv files and , otherwise.

Suppose we want to create a predicate table/6 from a CSV file that we know contains 6 fields per record. This can be done using the code below. Without the option arity(6), this would generate a predicate table/N, where N is the number of fields per record in the data.

?- csv_read_file(File, Rows, [functor(table), arity(6)]),
   maplist(assert, Rows).

  140csv_read_file(File, Rows) :-
  141    csv_read_file(File, Rows, []).
  142
  143csv_read_file(File, Rows, Options) :-
  144    default_separator(File, Options, Options1),
  145    make_csv_options(Options1, Record, RestOptions),
  146    phrase_from_file(csv_roptions(Rows, Record), File, RestOptions).
  147
  148
  149default_separator(File, Options0, Options) :-
  150    (   option(separator(_), Options0)
  151    ->  Options = Options0
  152    ;   file_name_extension(_, Ext0, File),
  153        downcase_atom(Ext0, Ext),
  154        ext_separator(Ext, Sep)
  155    ->  Options = [separator(Sep)|Options0]
  156    ;   Options = Options0
  157    ).
  158
  159ext_separator(csv, 0',).
  160ext_separator(tsv, 0'\t).

csv_read_stream(+Stream, -Rows, +Options) is det: Read CSV data from Stream. See also csv_read_row/3.

  167csv_read_stream(Stream, Rows, Options) :-
  168    make_csv_options(Options, Record, _),
  169    phrase_from_stream(csv_roptions(Rows, Record), Stream).

csv(?Rows)// is det

csv(?Rows, +Options)// is det

Prolog DCG to `read/write' CSV data. Options:

separator(+Code): The comma-separator. Must be a character code. Default is (of course) the comma. Character codes can be specified using the 0' notation. E.g., using separator(0';) parses a semicolon separated file.
ignore_quotes(+Boolean): If true (default false), treat double quotes as a normal character.
strip(+Boolean): If true (default false), strip leading and trailing blank space. RFC4180 says that blank space is part of the data.
skip_header(+CommentLead): Skip leading lines that start with CommentLead. There is no standard for comments in CSV files, but some CSV files have a header where each line starts with #. After skipping comment lines this option causes csv//2 to skip empty lines. Note that an empty line may not contain white space characters (space or tab) as these may provide valid data.
convert(+Boolean): If true (default), use name/2 on the field data. This translates the field into a number if possible.
case(+Action): If down, downcase atomic values. If up, upcase them and if preserve (default), do not change the case.
functor(+Atom): Functor to use for creating row terms. Default is row.
arity(?Arity): Number of fields in each row. This predicate raises a domain_error(row_arity(Expected), Found) if a row is found with different arity.
match_arity(+Boolean): If false (default true), do not reject CSV files where lines provide a varying number of fields (columns). This can be a work-around to use some incorrect CSV files.

  221csv(Rows) -->
  222    csv(Rows, []).
  223
  224csv(Rows, Options) -->
  225    { make_csv_options(Options, Record, _) },
  226    csv_roptions(Rows, Record).
  227
  228csv_roptions(Rows, Record) -->
  229    { ground(Rows) },
  230    !,
  231    emit_csv(Rows, Record).
  232csv_roptions(Rows, Record) -->
  233    skip_header(Record),
  234    csv_data(Rows, Record).
  235
  236skip_header(Options) -->
  237    { csv_options_skip_header(Options, CommentStart),
  238      nonvar(CommentStart),
  239      atom_codes(CommentStart, Codes)
  240    },
  241    !,
  242    skip_header_lines(Codes),
  243    skip_blank_lines.
  244skip_header(_) -->
  245    [].
  246
  247skip_header_lines(CommentStart) -->
  248    string(CommentStart),
  249    !,
  250    (   string(_Comment),
  251        end_of_record
  252    ->  skip_header_lines(CommentStart)
  253    ).
  254skip_header_lines(_) -->
  255    [].
  256
  257skip_blank_lines -->
  258    eos,
  259    !.
  260skip_blank_lines -->
  261    end_of_record,
  262    !,
  263    skip_blank_lines.
  264skip_blank_lines -->
  265    [].
  266
  267csv_data([], _) -->
  268    eos,
  269    !.
  270csv_data([Row|More], Options) -->
  271    row(Row, Options),
  272    !,
  273    { debug(csv, 'Row: ~p', [Row]) },
  274    csv_data(More, Options).
  275
  276
  277row(Row, Options) -->
  278    fields(Fields, Options),
  279    { csv_options_functor(Options, Functor),
  280      Row =.. [Functor|Fields],
  281      functor(Row, _, Arity),
  282      check_arity(Options, Arity)
  283    }.
  284
  285check_arity(Options, Arity) :-
  286    csv_options_arity(Options, Arity),
  287    !.
  288check_arity(Options, _) :-
  289    csv_options_match_arity(Options, false),
  290    !.
  291check_arity(Options, Arity) :-
  292    csv_options_arity(Options, Expected),
  293    domain_error(row_arity(Expected), Arity).
  294
  295fields([F|T], Options) -->
  296    field(F, Options),
  297    (   separator(Options)
  298    ->  fields(T, Options)
  299    ;   end_of_record
  300    ->  { T = [] }
  301    ).
  302
  303field(Value, Options) -->
  304    "\"",
  305    { csv_options_ignore_quotes(Options, false) },
  306    !,
  307    string_codes(Codes),
  308    { make_value(Codes, Value, Options) }.
  309field(Value, Options) -->
  310    { csv_options_strip(Options, true) },
  311    !,
  312    stripped_field(Value, Options).
  313field(Value, Options) -->
  314    { csv_options_separator(Options, Sep) },
  315    field_codes(Codes, Sep),
  316    { make_value(Codes, Value, Options) }.
  317
  318
  319stripped_field(Value, Options) -->
  320    ws,
  321    (   "\"",
  322        { csv_options_strip(Options, false) }
  323    ->  string_codes(Codes),
  324        ws
  325    ;   { csv_options_separator(Options, Sep) },
  326        field_codes(Codes0, Sep),
  327        { strip_trailing_ws(Codes0, Codes) }
  328    ),
  329    { make_value(Codes, Value, Options) }.
  330
  331ws --> " ", !, ws.
  332ws --> "\t", !, ws.
  333ws --> "".
  334
  335strip_trailing_ws(List, Stripped) :-
  336    append(Stripped, WS, List),
  337    all_ws(WS).
  338
  339all_ws([]).
  340all_ws([32|T]) :- all_ws(T).
  341all_ws([9|T]) :- all_ws(T).

string_codes(-Codes): Process a double-quotes string where the quote is escaped by doubling it. Eats the terminating double-quote.

  349string_codes(List) -->
  350    [H],
  351    (   { H == 0'" }
  352    ->  (   "\""
  353        ->  { List = [H|T] },
  354            string_codes(T)
  355        ;   { List = [] }
  356        )
  357    ;   { List = [H|T] },
  358        string_codes(T)
  359    ).
  360
  361field_codes([], Sep), [Sep] --> [Sep], !.
  362field_codes([], _), "\n" --> "\r\n", !.
  363field_codes([], _), "\n" --> "\n", !.
  364field_codes([], _), "\n" --> "\r", !.
  365field_codes([H|T], Sep) --> [H], !, field_codes(T, Sep).
  366field_codes([], _) --> [].              % unterminated last record

make_value(+Codes, -Value, +Options) is det: Convert a list of character codes to the actual value, depending on Options.

  373make_value(Codes, Value, Options) :-
  374    csv_options_convert(Options, Convert),
  375    csv_options_case(Options, Case),
  376    make_value(Convert, Case, Codes, Value).
  377
  378make_value(true, preserve, Codes, Value) :-
  379    !,
  380    name(Value, Codes).
  381make_value(true, Case, Codes, Value) :-
  382    !,
  383    (   number_string(Value, Codes)
  384    ->  true
  385    ;   make_value(false, Case, Codes, Value)
  386    ).
  387make_value(false, preserve, Codes, Value) :-
  388    !,
  389    atom_codes(Value, Codes).
  390make_value(false, down, Codes, Value) :-
  391    !,
  392    string_codes(String, Codes),
  393    downcase_atom(String, Value).
  394make_value(false, up, Codes, Value) :-
  395    string_codes(String, Codes),
  396    upcase_atom(String, Value).
  397
  398separator(Options) -->
  399    { csv_options_separator(Options, Sep) },
  400    [Sep].
  401
  402end_of_record --> "\n".			% Unix files
  403end_of_record --> "\r\n".               % DOS files
  404end_of_record --> "\r".                 % MacOS files
  405end_of_record --> eos.                  % unterminated last record

csv_read_file_row(+File, -Row, +Options) is nondet

True when Row is a row in File. First unifies Row with the first row in File. Backtracking yields the second, ... row. This interface is an alternative to csv_read_file/3 that avoids loading all rows in memory. Note that this interface does not guarantee that all rows in File have the same arity.

In addition to the options of csv_read_file/3, this predicate processes the option:

line(-Line): Line is unified with the 1-based line-number from which Row is read. Note that Line is not the physical line, but rather the logical record number.

  424csv_read_file_row(File, Row, Options) :-
  425    default_separator(File, Options, Options1),
  426    make_csv_options(Options1, RecordOptions, Options2),
  427    select_option(line(Line), Options2, RestOptions, _),
  428    setup_call_cleanup(
  429        open(File, read, Stream, RestOptions),
  430        csv_read_stream_row(Stream, Row, Line, RecordOptions),
  431        close(Stream)).
  432
  433csv_read_stream_row(Stream, Row, Line, Options) :-
  434    between(1, infinite, Line),
  435    (   csv_read_row(Stream, Row0, Options),
  436        Row0 \== end_of_file
  437    ->  Row = Row0
  438    ;   !,
  439        fail
  440    ).

csv_read_row(+Stream, -Row, +CompiledOptions) is det: Read the next CSV record from Stream and unify the result with Row. CompiledOptions is created from options defined for csv//2 using csv_options/2. Row is unified with end_of_file upon reaching the end of the input.

  450csv_read_row(Stream, Row, _Record) :-
  451    at_end_of_stream(Stream),
  452    !,
  453    Row = end_of_file.
  454csv_read_row(Stream, Row, Record) :-
  455    read_lines_to_codes(Stream, Codes, Record, even),
  456    phrase(row(Row0, Record), Codes),
  457    !,
  458    Row = Row0.
  459
  460read_lines_to_codes(Stream, Codes, Options, QuoteQuantity) :-
  461    read_line_to_codes(Stream, Codes0),
  462    Codes0 \== end_of_file,
  463    (   (   csv_options_ignore_quotes(Options, true)
  464        ;   check_quotes(Codes0, QuoteQuantity, even)
  465        )
  466    ->  Codes = Codes0
  467    ;   append(Codes0, [0'\n|Tail], Codes),
  468        read_lines_to_codes(Stream, Tail, Options, odd)
  469    ).
  470
  471check_quotes([], QuoteQuantity, QuoteQuantity) :-
  472    !.
  473check_quotes([0'"|T], odd, Result) :-
  474    !,
  475    check_quotes(T, even, Result).
  476check_quotes([0'"|T], even, Result) :-
  477    !,
  478    check_quotes(T, odd, Result).
  479check_quotes([_|T], QuoteQuantity, Result) :-
  480    check_quotes(T, QuoteQuantity, Result).

csv_options(-Compiled, +Options) is det: Compiled is the compiled representation of the CSV processing options as they may be passed into csv//2, etc. This predicate is used in combination with csv_read_row/3 to avoid repeated processing of the options.

  490csv_options(Compiled, Options) :-
  491    make_csv_options(Options, Compiled, _Ignored).
  492
  493
  494                /*******************************
  495                *             OUTPUT           *
  496                *******************************/

csv_write_file(+File, +Data) is det
csv_write_file(+File, +Data, +Options) is det: Write a list of Prolog terms to a CSV file. Options are given to csv//2. Remaining options are given to open/4. The default separator depends on the file name extension and is \t for .tsv files and , otherwise.

  506csv_write_file(File, Data) :-
  507    csv_write_file(File, Data, []).
  508
  509csv_write_file(File, Data, Options) :-
  510    must_be(list, Data),
  511    default_separator(File, Options, Options1),
  512    make_csv_options(Options1, OptionsRecord, RestOptions),
  513    setup_call_cleanup(
  514        open(File, write, Out, RestOptions),
  515        maplist(csv_write_row(Out, OptionsRecord), Data),
  516        close(Out)).
  517
  518csv_write_row(Out, OptionsRecord, Row) :-
  519    phrase(emit_row(Row, OptionsRecord), String),
  520    format(Out, '~s', [String]).
  521
  522emit_csv([], _) --> [].
  523emit_csv([H|T], Options) -->
  524    emit_row(H, Options),
  525    emit_csv(T, Options).
  526
  527emit_row(Row, Options) -->
  528    { Row =.. [_|Fields] },
  529    emit_fields(Fields, Options),
  530    "\r\n".                                     % RFC 4180 demands \r\n
  531
  532emit_fields([], _) -->
  533    "".
  534emit_fields([H|T], Options) -->
  535    emit_field(H, Options),
  536    (   { T == [] }
  537        ->  []
  538        ;   { csv_options_separator(Options, Sep) },
  539        [Sep],
  540        emit_fields(T, Options)
  541    ).
  542
  543emit_field(H, Options) -->
  544    { (   atom(H)
  545      ->  atom_codes(H, Codes)
  546      ;   string(H)
  547      ->  string_codes(H, Codes)
  548      )
  549    },
  550    !,
  551    (   { needs_quotes(H, Options) }
  552    ->  "\"", emit_string(Codes), "\""
  553    ;   emit_codes(Codes)
  554    ).
  555emit_field([], _) -->
  556    !,
  557    { atom_codes('[]', Codes) },
  558    emit_codes(Codes).
  559emit_field(H, _) -->
  560    { number_codes(H,Codes) },
  561    emit_codes(Codes).
  562
  563needs_quotes(Atom, _) :-
  564    sub_atom(Atom, _, _, _, '"'),
  565    !.
  566needs_quotes(Atom, _) :-
  567    sub_atom(Atom, _, _, _, '\n'),
  568    !.
  569needs_quotes(Atom, _) :-
  570    sub_atom(Atom, _, _, _, '\r'),
  571    !.
  572needs_quotes(Atom, Options) :-
  573    csv_options_separator(Options, Sep),
  574    char_code(Char, Sep),
  575    sub_atom(Atom, _, _, _, Char),
  576    !.
  577
  578emit_string([]) --> "".
  579emit_string([0'"|T]) --> !, "\"\"", emit_string(T).
  580emit_string([H|T]) --> [H], emit_string(T).
  581
  582emit_codes([]) --> "".
  583emit_codes([0'"|T]) --> !, "\"\"", emit_codes(T).
  584emit_codes([H|T]) --> [H], emit_codes(T).

csv_write_stream(+Stream, +Data, +Options) is det

Write the rows in Data to Stream. This is similar to csv_write_file/3, but can deal with data that is produced incrementally. The example below saves all answers from the predicate data/3 to File.

save_data(File) :-
   setup_call_cleanup(
       open(File, write, Out),
       forall(data(C1,C2,C3),
              csv_write_stream(Out, [row(C1,C2,C3)], [])),
       close(Out)).

  603csv_write_stream(Stream, Data, Options) :-
  604    must_be(list, Data),
  605    make_csv_options(Options, OptionsRecord, _),
  606    maplist(csv_write_row(Stream, OptionsRecord), Data)