% Raise the Prolog stack limit for this build script (value is in bytes, written with digit groups).
% NOTE(review): presumably needed because all symbol edges are collected in one in-memory list
% and then sorted (see std_mouse_graphs_strg/1) — confirm the figure against the current STRING release.
:- set_prolog_flag(stack_limit, 12 000 000 000).
% :- set_prolog_flag(stack_limit, 18 000 000 000).  % try this

% if library(lib) is missing, install via pack_install(lib).
%
:- use_module(library(lib)).
:- lib(debug_call).  % debuc/1,3

% external code, lib knows how to deal with these (will install if missing)
:- lib(mtx).
% :- lib(bio_db).
:- lib(os_lib).
:- lib(by_unix).
:- lib(options).
:- lib(stoics_lib:url_file/3).
:- lib(stoics_lib:message_report/3).
:- lib(stoics_lib:portray_clauses/2).
% also sets lib alias to that dir
:- ensure_loaded( '../../lib/bio_db_build_aliases' ).  % /1.

% load necessary data that has already been generated
% :- ensure_loaded(unip:bio_db_build_downloads('unip/maps/map_unip_mouse_ensp_unip')).
% :- ensure_loaded(unip:bio_db_build_downloads('unip/maps/map_unip_mouse_mgim_unip')).
% :- ensure_loaded(mgim:bio_db_build_downloads('mgim/maps/map_mgim_mouse_mgim_unip')).
% :- ensure_loaded(mgim:bio_db_build_downloads('mgim/maps/map_mgim_mouse_mgim_symb')).

% local libs & sources
:- lib(link_to_bio_sub/3).
:- lib(bio_db_dnt_times/3).
:- lib(bio_db_add_infos/1).                  % bio_db_add_infos_to/2.
:- lib(build_dnload_loc/3).
:- lib(portray_informed_clauses/4).
:- lib(url_file_local_date_mirror/3).
:- lib(std_graphs_strg_auto_version/1).
:- lib(bio_db_string_version_base_name/5).   % uses bio_db_source_url/3

% Debug topics switched on at load time (debuc/1 comes from lib(debug_call)).
:- debuc(by_unix).
% NOTE(review): the predicates in this file report under the topic std_mouse_graphs_strg
% (the value of Self), not std_graphs_strg — confirm which topic is intended here.
:- debuc(std_graphs_strg). % fixme:

% std_mouse_graphs_strg_defaults( +Args, -Defs ).
%
% Default options for std_mouse_graphs_strg/1.
% A string_version(Vers) default is appended only when
% std_graphs_strg_auto_version/2 succeeds on Args; otherwise no version
% default is supplied and options/2 is left to do the erroring
% (the user might have provided the version explicitly).
%
std_mouse_graphs_strg_defaults( Args, Defs ) :-
    ( std_graphs_strg_auto_version(Vers,Args) ->
        VersDefs = [string_version(Vers)]
        ;
        VersDefs = []
    ),
    Defs = [ db(strg),
             debug(true),
             debug_fetch(true),
             debug_url(false),
             iactive(true),
             org(mouse),
             relation(links)
             | VersDefs
           ].

% last good one: std_graphs_string( '10' ).  2016/09/08
% last good one: std_graphs_string( '10.5' ).  2018/03/30

/** std_mouse_graphs_strg( Opts ).

Mouse graphs for STRING protein protein interactions.

Depends on std_maps_mgim std_maps_unip.

Opts
  * db(Db=strg)
    source database
  * debug(Dbg=true)
    informational, progress messages
  * debug_fetch(Fbg=true)
    whether to debug the fetching of the url
  * debug_url(Ubg=false)
    whether to debug the concatenation of the url (via bio_db_source_url/3)
  * iactive(Iact=true)
    whether the session is interactive, otherwise wget gets --no-verbose
  * links_stem(Ltem='protein.links.v')
    stem for the filename of the remote links file
  * org(Org=mouse)
    organism
  * relation(Rel=links)
    relation of STRING we are interested in (bio_db_string_version_base_name/5)
  * string_version(Vers)
    default is collected by visiting the STRING web-page

==
?- std_mouse_graphs_strg([]).
==

@author nicos angelopoulos
@version  0:2 2022/12/27,  
@see https://string-db.org
@tbd this is closely related to the human pred, we should factor the common things out

*/
std_mouse_graphs_strg( Args ) :-
    Self = std_mouse_graphs_strg,
    options_append( Self, Args, Opts ),
    bio_db_build_aliases( Opts ),
    options( string_version(VersionPrv), Opts ),
    % load necessary data that has already been generated
    % (these are the unip: and mgim: tables consulted by ensp_mouse_symb/2)
    ensure_loaded(unip:bio_db_build_downloads('unip/maps/unip_musm_ensp_unip')),
    ensure_loaded(unip:bio_db_build_downloads('unip/maps/unip_musm_mgim_unip')),
    ensure_loaded(mgim:bio_db_build_downloads('mgim/maps/mgim_musm_mgim_unip')),
    ensure_loaded(mgim:bio_db_build_downloads('mgim/maps/mgim_musm_mgim_symb')),
    % the version may be given as a number (10.5) or an atom ('10.5'); work with the atom
    ( number(VersionPrv) -> atom_number(Version,VersionPrv); Version = VersionPrv ),
    debuc( Self, 'Version: ~w', Version ),
    bio_db_string_version_base_name( Version, _VersD, RemBname, SrcUrl, Opts ),
    debuc( Self, 'Remote base name: ~w', RemBname ),
    build_dnload_loc( Self, DnlD, Opts ),
    debuc( Self, 'Downloading from: ~p', SrcUrl ),
    url_file_local_date_mirror( SrcUrl, DnlD, [dnld_file(Bname),iface(wget)|Opts] ),
    debuc( Self, 'Basename to work on: ~p', [Bname] ),
    % All remaining work uses paths relative to DnlD.
    % setup_call_cleanup/3 guarantees we return to the original directory
    % also when a step fails or throws; once/1 commits to the first solution
    % so that the directory is restored as soon as the work is done.
    setup_call_cleanup(
        working_directory( Here, DnlD ),
        once( std_mouse_graphs_strg_in_dnld(Self,Bname,DnlD,SrcUrl) ),
        working_directory( _, Here )
    ).

% std_mouse_graphs_strg_in_dnld( +Self, +Bname, +DnlD, +SrcUrl ).
%
% Private helper of std_mouse_graphs_strg/1; expects the working directory to be DnlD.
% Unzips the downloaded Bname, converts it to the Ensembl protein edge file
% graphs/strg_musm_edge_ensp.pl, derives the symbol edge file
% graphs/strg_musm_edge_symb.pl, adds info terms to both and links them
% via link_to_bio_sub/3. Self is the debug topic, SrcUrl is recorded as the source.
%
std_mouse_graphs_strg_in_dnld( Self, Bname, DnlD, SrcUrl ) :-
    @ gunzip( -k, -f, Bname ),  % keeps .gz file
    EnspPn = strg_musm_edge_ensp,
    % strip the .gz extension to get the name of the unzipped text file
    file_name_extension( TxtF, gz, Bname ),
    debuc( Self, 'Directory: ~p', [DnlD] ),
    Mess1 = 'Converting string file: ~p, to Prolog',
    debuc( Self, Mess1, [TxtF] ),
    MtxOpts = [ csv_read(separator(0' )), predicate_name(EnspPn),
                rows_transform(maplist(user:de_mouse)), header_remove(true)
              ],
    mtx_prolog( TxtF, File, MtxOpts ),
    debuc( Self, 'Wrote on file: ~p', [File] ),
    delete_file( TxtF ),
    % do not remove the graphs directory: there are now multiple downloads from string
    os_make_path( graphs, debug(true) ),
    os_dir_stem_ext( graphs, EnspPn, pl, EnspRelF ),
    @ rm( -f, EnspRelF ),
    @ mv( File, EnspRelF ),
    mouse_strg_symbolise_edges( Self, EnspPn, EnspRelF, UnoSymbEdges ),
    % sort/2 also removes duplicate edges
    sort( UnoSymbEdges, SymbEdges ),
    length( SymbEdges, SymbEdgesLen ),
    debuc( Self, 'unique symbol edges (mouse): ~w', [SymbEdgesLen] ),
    EdgeSymbsF = 'graphs/strg_musm_edge_symb.pl',
    bio_db_dnt_times( Bname, DnDt, _EndDt ),
    EdgeSymbsInfos = [ source-SrcUrl, datetime-DnDt, header-header('Symbol','Symbol',weight),
                       data_types-data_types(atom,atom,integer)
                     ],
    portray_informed_clauses( SymbEdges, EdgeSymbsInfos, EdgeSymbsF, [] ),
    debuc( Self, wrote, EdgeSymbsF ),
    MousOpts = [ source(SrcUrl), datetime(DnDt),
                  header(row('Ensembl_Protein','Ensembl_Protein',weight))
                ],
    % announce the start of the task here; the matching stop follows the call
    debuc( Self, task(start), infosise(copy_stream(EnspRelF )) ),
    bio_db_add_infos_to( MousOpts, EnspRelF ),
    debuc( Self, task(stop), infosise(copy_stream) ),
    link_to_bio_sub( strg, [EnspRelF,EdgeSymbsF], [org(mouse),type(graphs)] ).

% At 13:34:57 on 10th of Jun 2023 starting task: symbolise(streamed).
% At 13:37:29 on 10th of Jun 2023 stop task: symbolise(streamed).
% 
% mouse_strg_symbolise_edges( +Self, +EnspPn, +EnspRelF, -Edges ).
%
% Read the terms of file EnspRelF one by one and collect in Edges the
% symbol edges produced by mouse_strg_symbolise_edges_stream/4.
% EnspPn is the predicate name each term in the file is expected to have;
% Self is the debug topic used for the start/stop task messages.
%
% The stream is handled via setup_call_cleanup/3 so that it is closed also
% when reading raises an error or a rogue term makes the streaming throw.
%
mouse_strg_symbolise_edges( Self, EnspPn, EnspRelF, Edges ) :-
     setup_call_cleanup(
          open( EnspRelF, read, InS ),
          ( read( InS, Term ),
            debuc( Self, task(start), symbolise(streamed) ),
            mouse_strg_symbolise_edges_stream( Term, EnspPn, InS, Edges ),
            debuc( Self, task(stop), symbolise(streamed) )
          ),
          close( InS )
     ).

% mouse_strg_symbolise_edges_stream( +Term, +Pn, +InS, -Edges ).
%
% Term is the term most recently read from stream InS.
% At end_of_file the list of Edges is closed. Otherwise Term must be a
% Pn/3 term, else rogue_ensp_to_symb_term(Term) is thrown. When both of its
% protein arguments map to symbols via ensp_mouse_symb/2, a
% strg_musm_edge_symb/3 term (symbols in standard order, weight unchanged)
% is added to Edges; terms with an unmapped protein are skipped.
%
mouse_strg_symbolise_edges_stream( end_of_file, _Pn, _InS, Edges ) :-
     !,
     Edges = [].
mouse_strg_symbolise_edges_stream( InTerm, Pn, InS, Edges ) :-
     % take the term apart, insisting on name Pn and arity 3
     ( InTerm =.. [Pn,EnsPa,EnsPb,Weight] ->
          true
          ;
          throw(rogue_ensp_to_symb_term(InTerm))
     ),
     ( ( ensp_mouse_symb(EnsPa,SymbFst),
         ensp_mouse_symb(EnsPb,SymbSnd) ) ->
          sort_four( SymbFst, SymbSnd, SymbLo, SymbHi ),
          Edges = [strg_musm_edge_symb(SymbLo,SymbHi,Weight)|RestEdges]
          ;
          Edges = RestEdges
     ),
     read( InS, NextTerm ),
     mouse_strg_symbolise_edges_stream( NextTerm, Pn, InS, RestEdges ).


% ensp_mouse_symb( +EnsP, -Symb ).
%
% Map an Ensembl protein id to a symbol: EnsP -> Unip -> Mgim -> Symb.
% The Unip to Mgim step tries the unip: table first and then the mgim: one.
% Only the first complete chain is returned (once/1).
% fixme: make sure committing to the first solution is green !
%
ensp_mouse_symb( EnsP, Symb ) :-
    once( ( unip:unip_musm_ensp_unip( EnsP, UniProt ),
            (   unip:unip_musm_mgim_unip( MgiId, UniProt )
                ;
                mgim:mgim_musm_mgim_unip( MgiId, UniProt )
            ),
            mgim:mgim_musm_mgim_symb( MgiId, Symb )
          ) ).

% sort_four( +X, +Y, -A, -B ).
%
% A and B are X and Y arranged in standard order of terms:
% the pair is swapped only when Y strictly precedes X.
%
sort_four( X, Y, A, B ) :-
    ( Y @< X ->
        A = Y, B = X
        ;
        A = X, B = Y
    ).

% de_mouse( +RowIn, -RowOut ).
%
% Row transformer used when converting the STRING links file.
% RowIn is row(Prot1,Prot2,Weight) where both proteins carry the '10090.'
% prefix; RowOut has the prefix stripped from both proteins and the weight
% as a number. A row that cannot be translated is reported and the run is aborted.
%
de_mouse( RowIn, RowOut ) :-
    ( de_mouse_row(RowIn,RowOut) ->
        true
        ;
        % NOTE(review): the debug topic is left unbound here, so whether this
        % message is shown depends on how debuc/3 treats a variable topic — confirm.
        debuc( _, 'Failed to translate row: ~w', RowIn ),
        abort
    ).

% de_mouse_row( +RowIn, -RowOut ).
%
% Does the actual translation for de_mouse/2; fails if RowIn is not a row/3
% term, a protein lacks the '10090.' prefix or the weight is not numeric.
%
de_mouse_row( row(PfxProt1,PfxProt2,WeightIn), row(Prot1,Prot2,Weight) ) :-
    atom_concat( '10090.', Prot1, PfxProt1 ),
    atom_concat( '10090.', Prot2, PfxProt2 ),
    ( number(WeightIn) -> Weight = WeightIn ; atom_number(WeightIn,Weight) ).