:- module(parse_annotated_wavfiles, [filter_annotated_wavfiles/3, filter_annotated_wavfiles/4, annotated_wavfiles_to_sents_file/2, annotated_wavfiles_to_sents_file/3, sents_file_to_grammar_probs_data_file/3, parse_annotated_wavfiles/5, parse_annotated_wavfile_line/2, split_annotated_wavfiles_into_training_and_test/4, check_annotated_wavfiles/3] ). %-------------------------------------------------------------------------------------------- :- use_module('$REGULUS/Alterf/Prolog/generic_numbers'). :- use_module('$REGULUS/Alterf/Prolog/classifier_utilities'). :- use_module('$REGULUS/PrologLib/utilities'). :- use_module(library(lists)). :- use_module(library(ordsets)). :- use_module(library(random)). :- use_module(library(system)). %-------------------------------------------------------------------------------------------- /* Routines for processing the test and training data. filter_annotated_wavfiles(+InFile, +FilterFile, +OutFile, +Mode) 1. InFile File of training data in the format defined in classifier_trainer.pl 2. FilterFile File specifying what to filter out. A line in the filter file can be of one of following two form: - ignore_examples_including_tags where is a space-separated list of semantic atoms. - ignore_examples_including_words where is a space-separated list of words An example filter file is home/speech/rialist/checklist/corpora/filter_spec. 3. OutFile File produced by filtering, in same format as InFile. 4. Mode. One of [keep_ignores, discard_ignores] If mode is 'keep_ignores', and the tags are not compatible with the filter spec, replace them with 'ignore' If mode is 'discard_ignores', keep with tags unchanged iff compatible with the filter spec, else discard. ----------------------------------------------------------------------------------------------- annotated_wavfiles_to_sents_file(+InFile, +OutFile, +Mode) 1. InFile Annotated wavfiles file. 2. OutFile File of records in one of forms sent(). [Mode = prolog] [Mode = text] that can be used for EBL training. Form depends on value of Mode. 3. Mode One of [prolog, text] ----------------------------------------------------------------------------------------------- sents_file_to_grammar_probs_data_file(+InFile, +GrammarName, +OutFile) 1. InFile Annotated wavfiles file. 2. GrammarName Prolog atom 3. OutFile File suitable for use in PFSG training. Each line is of form ----------------------------------------------------------------------------------------------- parse_annotated_wavfiles(+InFile, +WavsFile, +TranscriptionsFile, +SentsFile, +PrologFile) Do pre-processing of annotated wavfiles file preparatory to batchrec and Alterf training/testing. 1. InFile Annotated wavfiles file. 2. WavsFile File in format needed for Nuance batchrec 3. TranscriptionsFile File in format needed for Nuance batchrec 4. SentsFile File in format sent(, ). used by Alterf 5. PrologFile File in format wavfile_and_annotations(, , ) used by Alterf ----------------------------------------------------------------------------------------------- parse_annotated_wavfile_line(+Line, -ParsedLine) 1. Line Prolog string representing a line in an Alterf-tagged file. 2. ParsedLine Term of form either - batchrec_command(Command) with Command an atom - annotated_wavfile_line(Wavfile, Annotations, Transcription) with - Wavfile an atom - Annotations a list of Alterf tags - Transcription a list of atoms representing words ----------------------------------------------------------------------------------------------- split_annotated_wavfiles_into_training_and_test(+InFile, +TrainingFile, +TestFile, +ProportionTest) 1. InFile Annotated wavfiles file. 2. TrainingFile Annotated wavfiles file. 3. TestFile Annotated wavfiles file. 4. ProportionTest Real number between 0 and 1. Proportion of data to put in TestFile. ----------------------------------------------------------------------------------------------- check_annotated_wavfiles(+InFile, +OutFile, +ErrorFile) Interactive routine for hand-checking contents of annotated wavfiles file. On each record, the wavfile is played and the transcription shown to the human annotator, who has to approve/disapprove. Correct records are sent to OutFile, incorrect to ErrorFile. 1. InFile Annotated wavfiles file. 2. OutFile Annotated wavfiles file. 3. ErrorFile Annotated wavfiles file. */ filter_annotated_wavfiles(InFile, FilterFile, OutFile) :- filter_annotated_wavfiles(InFile, FilterFile, OutFile, discard_ignores). filter_annotated_wavfiles(InFile, FilterFile, OutFile, Mode) :- read_filter_file(FilterFile, FilterSpec), present_filter_spec(FilterSpec), absolute_file_name(InFile, AbsInFile), absolute_file_name(OutFile, AbsOutFile), format('~N~nFiltering annotated wavfiles file ~w~n', [AbsInFile]), open(InFile, read, SIn), open(OutFile, write, SOut), filter_annotated_wavfiles_stream(SIn, SOut, FilterSpec, Mode, 0-Count, 0-KeptCount), close(SIn), close(SOut), format('~NOutput written to ~w~n', [AbsOutFile]), format('~N~d records, of which ~d kept after filtering~n', [Count, KeptCount]), !. filter_annotated_wavfiles(InFile, FilterFile, _OutFile, _Mode) :- format('~N*** Error: unable to filter ~w using filter spec file ~w~n', [InFile, FilterFile]), fail. present_filter_spec(FilterSpec) :- format('~NFilter spec: ~w~n', [FilterSpec]). read_filter_file(InFile, FilterSpec) :- absolute_file_name(InFile, AbsInFile), format('~N~nReading filtering spec from file ~w~n', [AbsInFile]), read_file_to_atom_list(AbsInFile, FilterFileAtomList), atom_list_to_filter_spec(FilterFileAtomList, FilterSpec), !. read_filter_file(InFile, _FilterSpec) :- format('~N*** Error: unable to parse filter spec file ~w~n', [InFile]), fail. atom_list_to_filter_spec(FilterFileAtomList, FilterSpec) :- atom_list_to_filter_spec1(FilterFileAtomList, FilterFileAtomList1), findall( BadTag, ( member(bad_tags(BadTags0), FilterFileAtomList1), member(BadTag, BadTags0) ), BadTags), findall( BadWord, ( member(bad_words(BadWords0), FilterFileAtomList1), member(BadWord, BadWords0) ), BadWords), FilterSpec = [bad_tags=BadTags, bad_words=BadWords]. atom_list_to_filter_spec1([], []). atom_list_to_filter_spec1([F | R], [F1 | R1]) :- parse_filter_spec_line(F, F1), atom_list_to_filter_spec1(R, R1). parse_filter_spec_line(Atom, Result) :- split_atom_into_words(Atom, Words), ( Words = [] -> Result = null ; Words = [ignore_examples_including_tags | BadTags] -> Result = bad_tags(BadTags) ; Words = [ignore_examples_including_words | BadWords] -> Result = bad_words(BadWords) ), !. parse_filter_spec_line(Atom, _Result) :- format('~N*** Error: bad line in filter spec file: "~w"~n', [Atom]), fail. filter_annotated_wavfiles_stream(SIn, SOut, FilterSpec, Mode, CountIn-CountOut, KeptCountIn-KeptCountOut) :- getline(SIn, Line), filter_annotated_wavfiles_stream1(Line, SIn, SOut, FilterSpec, Mode, CountIn-CountOut, KeptCountIn-KeptCountOut). filter_annotated_wavfiles_stream1(Line, _SIn, _SOut, _FilterSpec, _Mode, CountIn-CountIn, KeptCountIn-KeptCountIn) :- Line = eof, !. filter_annotated_wavfiles_stream1(Line, SIn, SOut, FilterSpec, Mode, CountIn-CountOut, KeptCountIn-KeptCountOut) :- atom_chars(LineAtom, Line), CountNext is CountIn + 1, ( parse_annotated_wavfile_line(Line, ParsedLine) -> filter_annotated_wavfiles_stream2(ParsedLine, LineAtom, SOut, FilterSpec, Mode, KeptCountIn-KeptCountNext) ; format('~N*** Error: unable to parse line ~w in annotated wavfile~n', [LineAtom]), fail ), !, filter_annotated_wavfiles_stream(SIn, SOut, FilterSpec, Mode, CountNext-CountOut, KeptCountNext-KeptCountOut). filter_annotated_wavfiles_stream2(ParsedLine, _LineAtom, _SOut, _FilterSpec, _Mode, KeptCountIn-KeptCountIn) :- ParsedLine = null, !. filter_annotated_wavfiles_stream2(ParsedLine, _LineAtom, SOut, _FilterSpec, _Mode, KeptCountIn-KeptCountOut) :- ParsedLine = batchrec_command(Command), format(SOut, '~N~w~n', [Command]), KeptCountOut is KeptCountIn + 1, !. filter_annotated_wavfiles_stream2(ParsedLine, _LineAtom, SOut, FilterSpec, Mode, KeptCountIn-KeptCountOut) :- ParsedLine = annotated_wavfile_line(Wavfile, Annotations, Transcription), ( annotations_and_transcription_compatible_with_filter_spec(Annotations, Transcription, FilterSpec, Mode, Annotations1) -> KeptCountOut is KeptCountIn + 1, coerce_list_elements_to_atoms_if_necessary(Annotations1, Annotations2), join_with_spaces(Annotations2, AnnotationsAtom), join_with_spaces(Transcription, TranscriptionAtom), format(SOut, '~N~w | ~w | ~w~n', [Wavfile, AnnotationsAtom, TranscriptionAtom]) ; KeptCountOut is KeptCountIn ), !. filter_annotated_wavfiles_stream2(ParsedLine, LineAtom, SOut, FilterSpec, Mode, KeptCounts) :- format('~N*** Error: bad call: ~w~n', [filter_annotated_wavfiles_stream2(ParsedLine, LineAtom, SOut, FilterSpec, Mode, KeptCounts)]), fail. % If mode is 'keep_ignores', and the tags are compatible with the filter spec, keep them annotations_and_transcription_compatible_with_filter_spec(Annotations, Transcription, FilterSpec, keep_ignores, Annotations) :- FilterSpec = [bad_tags=BadTags, bad_words=BadWords], \+ lists_share_atom(Annotations, BadTags), \+ lists_share_atom(Transcription, BadWords), !. % If mode is 'keep_ignores', and the tags are not compatible with the filter spec, replace them with 'ignore' annotations_and_transcription_compatible_with_filter_spec(_Annotations, _Transcription, _FilterSpec, keep_ignores, Annotations1) :- Annotations1 = [ignore], !. % If mode is 'discard_ignores', keep with tags unchanged iff compatible with the filter spec, else discard. annotations_and_transcription_compatible_with_filter_spec(Annotations, Transcription, FilterSpec, discard_ignores, Annotations) :- FilterSpec = [bad_tags=BadTags, bad_words=BadWords], !, \+ lists_share_atom(Annotations, BadTags), \+ lists_share_atom(Transcription, BadWords). annotations_and_transcription_compatible_with_filter_spec(Annotations, Transcription, FilterSpec, Mode, Annotations1) :- format('~N*** Error: bad call: ~w~n', [annotations_and_transcription_compatible_with_filter_spec(Annotations, Transcription, FilterSpec, Mode, Annotations1)]), fail. %-------------------------------------------------------------------------------------------- check_annotated_wavfiles(InFile, OutFile, ErrorFile) :- statistics(runtime, [StartTime, _]), absolute_file_name(InFile, AbsInFile), absolute_file_name(OutFile, AbsOutFile), absolute_file_name(ErrorFile, AbsErrorFile), format('~N~nChecking annotated wavfiles file ~w~n', [AbsInFile]), open(AbsInFile, read, SIn), check_annotated_wavfiles_stream(SIn, OutFile, ErrorFile, 0-Count, 0-KeptCount), close(SIn), format('~N~n--------------------------------------------------------------------~n', []), format('~N~nOutput written to ~w (correct) and ~w (incorrect)~n', [AbsOutFile, AbsErrorFile]), format('~N~d records, of which ~d kept after checking~n', [Count, KeptCount]), statistics(runtime, [EndTime, _]), % Time returned is in milliseconds. TimeTaken is ( float(EndTime) - float(StartTime) ) / 1000.0, ( Count > 0 -> TimePerRecord is TimeTaken / Count ; TimePerRecord = 0.0 ), format('~NElapsed time = ~1f seconds (~1f seconds/record)~n', [TimeTaken, TimePerRecord]), !. check_annotated_wavfiles(InFile, OutFile, ErrorFile) :- format('~N*** Error: bad call: ~w~n', [check_annotated_wavfiles(InFile, OutFile, ErrorFile)]), fail. check_annotated_wavfiles_stream(SIn, OutFile, ErrorFile, CountIn-CountOut, KeptCountIn-KeptCountOut) :- getline(SIn, Line), check_annotated_wavfiles_stream1(Line, SIn, OutFile, ErrorFile, CountIn-CountOut, KeptCountIn-KeptCountOut). check_annotated_wavfiles_stream1(Line, _SIn, _OutFile, _ErrorFile, CountIn-CountIn, KeptCountIn-KeptCountIn) :- Line = eof, !. check_annotated_wavfiles_stream1(Line, SIn, OutFile, ErrorFile, CountIn-CountOut, KeptCountIn-KeptCountOut) :- atom_chars(LineAtom, Line), CountNext is CountIn + 1, ( parse_annotated_wavfile_line(Line, ParsedLine) -> check_annotated_wavfiles_stream2(ParsedLine, LineAtom, OutFile, ErrorFile, CountNext, KeptCountIn-KeptCountNext) ; format('~N*** Error: unable to parse line ~w in annotated wavfile~n', [LineAtom]), fail ), !, check_annotated_wavfiles_stream(SIn, OutFile, ErrorFile, CountNext-CountOut, KeptCountNext-KeptCountOut). check_annotated_wavfiles_stream2(ParsedLine, _LineAtom, _OutFile, _ErrorFile, _Count, KeptCountIn-KeptCountIn) :- ParsedLine = null, !. check_annotated_wavfiles_stream2(ParsedLine, _LineAtom, OutFile, ErrorFile, Count, KeptCountIn-KeptCountOut) :- ParsedLine = annotated_wavfile_line(Wavfile, Annotations, Transcription), coerce_list_elements_to_atoms_if_necessary(Annotations, Annotations2), join_with_spaces(Annotations2, AnnotationsAtom), join_with_spaces(Transcription, TranscriptionAtom), ( wavfile_and_tags_confirmed(Count, Wavfile, AnnotationsAtom, TranscriptionAtom) -> KeptCountOut is KeptCountIn + 1, AppropriateFile = OutFile ; KeptCountOut is KeptCountIn, AppropriateFile = ErrorFile ), open(AppropriateFile, append, S), format(S, '~N~w | ~w | ~w~n', [Wavfile, AnnotationsAtom, TranscriptionAtom]), close(S), !. check_annotated_wavfiles_stream2(ParsedLine, LineAtom, OutFile, ErrorFile, Count, KeptCounts) :- format('~N*** Error: bad call: ~w~n', [check_annotated_wavfiles_stream2(ParsedLine, LineAtom, OutFile, ErrorFile, Count, KeptCounts)]), fail. wavfile_and_tags_confirmed(Count, Wavfile, Annotations, Transcription) :- absolute_file_name(Wavfile, Wavfile1), ( file_exists(Wavfile1) -> true ; format('~N*** Error: file does not exist: ~w~n', [Wavfile1]), fail ), system_on_list([playwav, Wavfile1, '>', 'tmp_output.txt']), format('~N~n~n------------------------------------~n', []), format('~NRecord number ~d~n', [Count]), format('~NTranscription: ~w~n', [Transcription]), format('~N Annotations: ~w~n', [Annotations]), format('~N~nOK? (y, n, r) ', []), getline(user, Line), atom_chars(Atom, Line), !, wavfile_and_tags_confirmed1(Atom, Count, Wavfile, Annotations, Transcription). wavfile_and_tags_confirmed(Count, Wavfile, Annotations, Transcription) :- format('~N*** Error: bad call: ~w~n', [wavfile_and_tags_confirmed(Count, Wavfile, Annotations, Transcription)]), fail. wavfile_and_tags_confirmed1(y, _Count, _Wavfile, _Annotations, _Transcription) :- !. wavfile_and_tags_confirmed1(r, Count, Wavfile, Annotations, Transcription) :- !, wavfile_and_tags_confirmed(Count, Wavfile, Annotations, Transcription). wavfile_and_tags_confirmed1(Other, Count, Wavfile, Annotations, Transcription) :- Other \== n, format('~NPlease answer "y" (yes), "n" (no), "r" (repeat)~n', []), !, wavfile_and_tags_confirmed(Count, Wavfile, Annotations, Transcription). %-------------------------------------------------------------------------------------------- annotated_wavfiles_to_sents_file(InFile, OutFile) :- annotated_wavfiles_to_sents_file(InFile, OutFile, prolog). annotated_wavfiles_to_sents_file(InFile, OutFile, Format) :- \+ member(Format, [prolog, text]), !, format('~N*** Error: bad call: ~w. Last argument must be "prolog" or "text".~n', [annotated_wavfiles_to_sents_file(InFile, OutFile, Format)]), fail. annotated_wavfiles_to_sents_file(InFile, OutFile, Format) :- absolute_file_name(InFile, AbsInFile), absolute_file_name(OutFile, AbsOutFile), format('~N~nExtracting sents file from ~w~n', [AbsInFile]), open(InFile, read, SIn), open(OutFile, write, SOut), annotated_wavfiles_to_sents_file_stream(SIn, SOut, Format, 0-Count), close(SIn), close(SOut), format('~NOutput written to ~w (~d records)~n~n', [AbsOutFile, Count]). annotated_wavfiles_to_sents_file_stream(SIn, SOut, Format, CountIn-CountOut) :- getline(SIn, Line), annotated_wavfiles_to_sents_file_stream1(Line, SIn, SOut, Format, CountIn-CountOut). annotated_wavfiles_to_sents_file_stream1(Line, _SIn, _SOut, _Format, Count-Count) :- Line = eof, !. annotated_wavfiles_to_sents_file_stream1(Line, SIn, SOut, Format, CountIn-CountOut) :- atom_chars(LineAtom, Line), ( parse_annotated_wavfile_line(Line, ParsedLine) -> annotated_wavfiles_to_sents_file_stream2(ParsedLine, LineAtom, SOut, Format, CountIn-CountNext) ; format('~N*** Error: unable to parse line ~w in annotated wavfile~n', [LineAtom]), fail ), !, annotated_wavfiles_to_sents_file_stream(SIn, SOut, Format, CountNext-CountOut). annotated_wavfiles_to_sents_file_stream2(ParsedLine, _Line, _SOut, _Format, Count-Count) :- ParsedLine = null, !. annotated_wavfiles_to_sents_file_stream2(ParsedLine, _Line, _SOut, _Format, Count-Count) :- ParsedLine = batchrec_command(_Command), !. annotated_wavfiles_to_sents_file_stream2(ParsedLine, _Line, _SOut, _Format, CountIn-CountOut) :- ParsedLine = annotated_wavfile_line(_Wavfile, AnnotationsOS, Transcription), ( Transcription = [] ; AnnotationsOS = [ignore] ), CountOut is CountIn, !. annotated_wavfiles_to_sents_file_stream2(ParsedLine, _Line, SOut, Format, CountIn-CountOut) :- ParsedLine = annotated_wavfile_line(_Wavfile, AnnotationsOS, Transcription), join_with_spaces(Transcription, TranscriptionAtom), ( Format = prolog -> alterf_tags_to_grammar_ids_if_possible(AnnotationsOS, GrammarIds), format(SOut, '~N~q.~n', [sent(TranscriptionAtom, GrammarIds)]) ; format(SOut, '~N~w~n', [TranscriptionAtom]) ), CountOut is CountIn + 1, !. annotated_wavfiles_to_sents_file_stream2(_ParsedLine, Line, _STraining, _STest, _ProportionTest) :- format('~N*** Error: unable to process line ~w in annotated wavfile~n', [Line]), fail. alterf_tags_to_grammar_ids_if_possible(AlterfTags, GrammarIds) :- current_predicate(user:alterf_tags_to_grammar_id/2), findall(GrammarId, user:alterf_tags_to_grammar_id(AlterfTags, GrammarId), GrammarIds0), sort(GrammarIds0, GrammarIds), !. alterf_tags_to_grammar_ids_if_possible(AlterfTags, AlterfTags). %-------------------------------------------------------------------------------------------- sents_file_to_grammar_probs_data_file(InFile, BaseGrammarName, OutFile) :- absolute_file_name(InFile, AbsInFile), absolute_file_name(OutFile, AbsOutFile), format('~N~nConverting sents file ~w into grammar probs data file~n', [AbsInFile]), open(AbsInFile, read, SIn), open(AbsOutFile, write, SOut), sents_file_to_grammar_probs_data_stream(SIn, BaseGrammarName, SOut, 0-Count), close(SIn), close(SOut), format('~N~nGrammar probs data file ~w written (~d records)~n', [AbsOutFile, Count]), !. sents_file_to_grammar_probs_data_file(InFile, BaseGrammarName, OutFile) :- format('~N*** Error: bad call: ~w~n', [sents_file_to_grammar_probs_data_file(InFile, BaseGrammarName, OutFile)]), fail. sents_file_to_grammar_probs_data_stream(SIn, BaseGrammarName, SOut, CIn-COut) :- read(SIn, T), sents_file_to_grammar_probs_data_stream1(T, SIn, BaseGrammarName, SOut, CIn-COut). sents_file_to_grammar_probs_data_stream1(end_of_file, _SIn, _BaseGrammarName, _SOut, CIn-CIn) :- !. sents_file_to_grammar_probs_data_stream1(Record, SIn, BaseGrammarName, SOut, CIn-COut) :- functor(Record, sent, N), N >= 1, arg(1, Record, TranscriptionAtom), ( N = 1 -> GrammarIDs = [default] ; arg(2, Record, GrammarIDs) ), sents_file_to_grammar_probs_data_stream2(GrammarIDs, TranscriptionAtom, BaseGrammarName, SOut, CIn-CNext), !, sents_file_to_grammar_probs_data_stream(SIn, BaseGrammarName, SOut, CNext-COut). sents_file_to_grammar_probs_data_stream2([], _TranscriptionAtom, _BaseGrammarName, _SOut, CIn-CIn) :- !. sents_file_to_grammar_probs_data_stream2([GrammarID | GrammarIDs], TranscriptionAtom, BaseGrammarName, SOut, CIn-COut) :- sents_file_to_grammar_probs_data_stream3(GrammarID, TranscriptionAtom, BaseGrammarName, SOut, CIn-CNext), !, sents_file_to_grammar_probs_data_stream2(GrammarIDs, TranscriptionAtom, BaseGrammarName, SOut, CNext-COut). sents_file_to_grammar_probs_data_stream3(GrammarID, TranscriptionAtom, BaseGrammarName, SOut, CIn-COut) :- ( GrammarID = default -> format(SOut, '~N~w ~w~n', [BaseGrammarName, TranscriptionAtom]) ; format(SOut, '~N~w__~w ~w~n', [BaseGrammarName, GrammarID, TranscriptionAtom]) ), COut is CIn + 1, !. %-------------------------------------------------------------------------------------------- split_annotated_wavfiles_into_training_and_test(InFile, TrainingFile, TestFile, ProportionTest) :- open(InFile, read, SIn), open(TrainingFile, write, STraining), open(TestFile, write, STest), split_annotated_wavfiles_into_training_and_test_stream(SIn, STraining, STest, ProportionTest), close(SIn), close(STraining), close(STest). split_annotated_wavfiles_into_training_and_test_stream(SIn, STraining, STest, ProportionTest) :- getline(SIn, Line), split_annotated_wavfiles_into_training_and_test_stream1(Line, SIn, STraining, STest, ProportionTest). split_annotated_wavfiles_into_training_and_test_stream1(Line, _SIn, _STraining, _STest, _ProportionTest) :- Line = eof, !. split_annotated_wavfiles_into_training_and_test_stream1(Line, SIn, STraining, STest, ProportionTest) :- atom_chars(LineAtom, Line), ( parse_annotated_wavfile_line(Line, ParsedLine) -> split_annotated_wavfiles_into_training_and_test_stream2(ParsedLine, LineAtom, STraining, STest, ProportionTest) ; format('~N*** Error: unable to parse line ~w in annotated wavfile~n', [LineAtom]), fail ), !, split_annotated_wavfiles_into_training_and_test_stream(SIn, STraining, STest, ProportionTest). split_annotated_wavfiles_into_training_and_test_stream2(ParsedLine, _Line, _STraining, _STest, _ProportionTest) :- ParsedLine = null, !. split_annotated_wavfiles_into_training_and_test_stream2(ParsedLine, Line, STraining, STest, _ProportionTest) :- ParsedLine = batchrec_command(_Command), !, format(STraining, '~N~w~n', [Line]), format(STest, '~N~w~n', [Line]). split_annotated_wavfiles_into_training_and_test_stream2(ParsedLine, Line, STraining, STest, ProportionTest) :- ParsedLine = annotated_wavfile_line(_Wavfile, _AnnotationsOS, _Transcription), !, random(RandomNumber), ( RandomNumber < ProportionTest -> format(STest, '~N~w~n', [Line]) ; format(STraining, '~N~w~n', [Line]) ). split_annotated_wavfiles_into_training_and_test_stream2(_ParsedLine, Line, _STraining, _STest, _ProportionTest) :- format('~N*** Error: unable to classify line ~w in annotated wavfile~n', [Line]), fail. %-------------------------------------------------------------------------------------------- parse_annotated_wavfiles(InFile, WavsFile, TranscriptionsFile, SentsFile, PrologFile) :- open(InFile, read, SIn), open(WavsFile, write, SOutWavs), open(TranscriptionsFile, write, SOutTranscriptions), open(SentsFile, write, SOutSents), open(PrologFile, write, SOutProlog), parse_annotated_wavfiles_stream([SIn, SOutWavs, SOutTranscriptions, SOutSents, SOutProlog], 0), close(SIn), close(SOutWavs), close(SOutTranscriptions), close(SOutSents), close(SOutProlog). parse_annotated_wavfiles_stream(Streams, Counter) :- Streams = [SIn | _OutStreams], getline(SIn, Line), Counter1 is Counter + 1, Mod is Counter1 mod 500, ( Mod = 0 -> format('~d ', [Counter1]), flush_output(user) ; true ), parse_annotated_wavfiles_stream1(Line, Streams, Counter1). parse_annotated_wavfiles_stream1(Line, _Streams, _Counter) :- Line = eof, !. parse_annotated_wavfiles_stream1(Line, [SIn | OutStreams], Counter) :- parse_annotated_wavfiles_line(Line, OutStreams), !, parse_annotated_wavfiles_stream([SIn | OutStreams], Counter). %-------------------------------------------------------------------------------------------- parse_annotated_wavfiles_line(Line, OutStreams) :- parse_annotated_wavfile_line(Line, ParsedLine), parse_annotated_wavfiles_line1(ParsedLine, OutStreams), !. parse_annotated_wavfiles_line(Line, OutStreams) :- format('N*** Error: bad call: ~q~n', [parse_annotated_wavfiles_line(Line, OutStreams)]), fail. parse_annotated_wavfiles_line1(null, _OutStreams) :- !. parse_annotated_wavfiles_line1(batchrec_command(BatchrecCommand), [SOutWavs, _SOutTranscriptions, _SOutSents, _SOutProlog]) :- format(SOutWavs, '~N~w~n', [BatchrecCommand]), !. parse_annotated_wavfiles_line1(annotated_wavfile_line(Wavfile, AnnotationsOS, Transcription), OutStreams) :- OutStreams = [SOutWavs, SOutTranscriptions, SOutSents, SOutProlog], format(SOutWavs, '~N~w~n', [Wavfile]), ( Transcription = [] -> format(SOutTranscriptions, '~N~w~n', [Wavfile]) ; join_with_spaces(Transcription, TranscriptionAtom), format(SOutTranscriptions, '~N~w ~w~n', [Wavfile, TranscriptionAtom]), format(SOutSents, '~N~q.~n', [sent(TranscriptionAtom, AnnotationsOS)]) ), format(SOutProlog, '~N~q.~n', [wavfile_and_annotations(Wavfile, AnnotationsOS, Transcription)]), !. %-------------------------------------------------------------------------------------------- parse_annotated_wavfile_line(Line, ParsedLine) :- Line = [], !, ParsedLine = null. parse_annotated_wavfile_line(Line, ParsedLine) :- Line = [0'# | _Rest], !, ParsedLine = null. parse_annotated_wavfile_line(Line, ParsedLine) :- split_string_into_words(Line, 0'#, PreAndPostCommentParts), ( PreAndPostCommentParts = [] -> ParsedLine = null ; PreAndPostCommentParts = [PreCommentPart | _Rest], parse_annotated_wavfile_line1(PreCommentPart, ParsedLine) ), !. parse_annotated_wavfile_line(Line, _ParsedLine) :- format('N*** Error: bad line in call to parse_annotated_wavfile_line: "~s"~n', [Line]), fail. parse_annotated_wavfile_line1(Atom, batchrec_command(Atom)) :- looks_like_batchrec_command(Atom), !. parse_annotated_wavfile_line1(LineAtom, annotated_wavfile_line(AbsWavfile, AnnotationsOS, Transcription)) :- split_atom_into_words(LineAtom, 0'|, BarComponents), ( BarComponents = [WavfileAtom, AnnotationsAtom, TranscriptionAtom, _Comments] -> split_atom_into_words(TranscriptionAtom, Transcription) ; BarComponents = [WavfileAtom, AnnotationsAtom, TranscriptionAtom] -> split_atom_into_words(TranscriptionAtom, Transcription) ; BarComponents = [WavfileAtom, AnnotationsAtom] -> Transcription = [] ), split_atom_into_words(WavfileAtom, [Wavfile]), absolute_file_name(Wavfile, AbsWavfile), split_atom_into_words(AnnotationsAtom, Annotations), coerce_list_elements_to_numbers_if_possible(Annotations, Annotations1), list_to_ord_set(Annotations1, AnnotationsOS), !. %-------------------------------------------------------------------------------------------- looks_like_batchrec_command(BatchrecCommandAtom) :- atom(BatchrecCommandAtom), atom_chars(BatchrecCommandAtom, Chars), Chars = [0'* | _Rest], !. lists_share_atom(L1, L2) :- member(X, L1), member(X, L2), !.