From 044c3f5fae0f04b6b484eb83c390df6638fe2b51 Mon Sep 17 00:00:00 2001 From: Henry Whitney Date: Mon, 29 Jun 2020 15:51:43 -0500 Subject: [PATCH] work on tagged OGNT --- Tagged_OGNT/Build_ULB_for_Tagging.0.pl | 63 +++++++++++++++++++++ Tagged_OGNT/Build_ULB_for_Tagging.pl | 78 ++++++++++++++++++++++++++ Tagged_OGNT/Data/USFM_markers.txt | 4 +- Tagged_OGNT/Tag_OGNT.pl | 8 +-- 4 files changed, 147 insertions(+), 6 deletions(-) create mode 100644 Tagged_OGNT/Build_ULB_for_Tagging.0.pl create mode 100644 Tagged_OGNT/Build_ULB_for_Tagging.pl diff --git a/Tagged_OGNT/Build_ULB_for_Tagging.0.pl b/Tagged_OGNT/Build_ULB_for_Tagging.0.pl new file mode 100644 index 0000000..a4cd80b --- /dev/null +++ b/Tagged_OGNT/Build_ULB_for_Tagging.0.pl @@ -0,0 +1,63 @@ +use 5.18.0; +use File::Slurp; +use File::Find ; +use Cwd ; +use utf8; +#use open IN => ":utf8", OUT => ":utf8"; +use open IO => ":utf8"; + +open(LOG, ">Logs/Log.txt") or die "$!"; +open(OUT, ">Output/ULB.xml") or die "$!"; +say OUT ""; + +my ($topDir, $outDir) = ("/Users/Henry/Documents/WACS/en_ulb", "/Users/Henry/Documents/WACS/Tips_and_Hacks/Tagged_OGNT/Output"); + +my @filesToRun = (); +my $filePattern = '63-1JN\.usfm' ; +my $file; +find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ; + +ReadFiles(); + +say OUT ""; +close OUT; +close LOG; + +say "\nDone."; +# ===== +sub ReadFiles { + + foreach $file ( @filesToRun ) { + say $file; + my $fileText = read_file("$file", binmode => 'utf8'); + #say LOG $fileText; + +#Delete \n + my ($book, $chap, $vers); + if ($fileText =~ /\\h ([^\n]*)/) { + $book = $1 + } + #say LOG $book; + $fileText =~ s/\n/ /g; + $fileText =~ s/ / /g; + #say LOG $fileText; + $fileText =~ s/^([^\n]*?)(\\s5)/\t\n\t\t$1<\/heading>$2/; + $fileText =~ s/\\v/√/g; + $fileText =~ s/\\s5[^√]*?\\c (\d+)/\n$&/g; + if ($fileText =~ s#\\s5 \\c (\d+)[^\n]*#\t\t\n$&\n\t\t#g) {$chap = $1} + $fileText =~ s/(]*>\n)([^\n]*?\\c \d+) /$1\t\t\t$2<\/preVerse>\n/gs; + $fileText =~ s/(\\s5[^\n√]*)√ (\d+) /\n\t\t\t$1\\v$2<\/preVerse>\n/g; + say LOG $fileText; + $fileText =~ s/\n([^\n√]*)/\n\t\t\t\t$1<\/preVerse>\n/g; + #while ($fileText =~ s/(.*?\n)(\\p √ (\d+)) /$1<>/) { } + + $fileText =~ s/√/\\v/g; + #$fileText =~ s/(\\s5.*?\\v \d+ )/\t\t\t$1<\/preVerse>\n/g; + #Capture heading and text + #Capture chapters +#Capture verses + say OUT $fileText; + say OUT "\t"; + } + +} diff --git a/Tagged_OGNT/Build_ULB_for_Tagging.pl b/Tagged_OGNT/Build_ULB_for_Tagging.pl new file mode 100644 index 0000000..c44ef18 --- /dev/null +++ b/Tagged_OGNT/Build_ULB_for_Tagging.pl @@ -0,0 +1,78 @@ +# second try, reading file line by line + +use 5.18.0; +use File::Slurp; +use File::Find ; +use Cwd ; +use utf8; +#use open IN => ":utf8", OUT => ":utf8"; +use open IO => ":utf8"; + +open(LOG, ">Logs/Log.txt") or die "$!"; +open(OUT, ">Output/ULB.xml") or die "$!"; +say OUT ""; + +my ($topDir, $outDir) = ("/Users/Henry/Documents/WACS/en_ulb", "/Users/Henry/Documents/WACS/Tips_and_Hacks/Tagged_OGNT/Output"); + +my @filesToRun = (); +my $filePattern = '63-1JN\.usfm' ; +my $file; +find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ; + +ReadFiles(); + +say OUT ""; +close OUT; +close LOG; + +say "\nDone."; +# ===== +sub ReadFiles { + + foreach $file ( @filesToRun ) { + say $file; + + open(IN, $file) or die "$!"; + + my ($heading, $book, $chap, $vers, $flag); + + while () { + chomp; + say LOG $_; + if ($flag) { + die + } + elsif (/^\\(ide?|h|toc\d|mt|cl)/) { + $heading .= "$_ "; + } + else { + $flag = 1; + say OUT "\t$heading" + } + #if ($fileText =~ /\\h ([^\n]*)/) { + # $book = $1 + #} + ##say LOG $book; + #$fileText =~ s/\n/ /g; + #$fileText =~ s/ / /g; + ##say LOG $fileText; + #$fileText =~ s/^([^\n]*?)(\\s5)/\t\n\t\t$1<\/heading>$2/; + #$fileText =~ s/\\v/√/g; + #$fileText =~ s/\\s5[^√]*?\\c (\d+)/\n$&/g; + #if ($fileText =~ s#\\s5 \\c (\d+)[^\n]*#\t\t\n$&\n\t\t#g) {$chap = $1} + #$fileText =~ s/(]*>\n)([^\n]*?\\c \d+) /$1\t\t\t$2<\/preVerse>\n/gs; + #$fileText =~ s/(\\s5[^\n√]*)√ (\d+) /\n\t\t\t$1\\v$2<\/preVerse>\n/g; + #say LOG $fileText; + #$fileText =~ s/\n([^\n√]*)/\n\t\t\t\t$1<\/preVerse>\n/g; + ##while ($fileText =~ s/(.*?\n)(\\p √ (\d+)) /$1<>/) { } + # + #$fileText =~ s/√/\\v/g; + #$fileText =~ s/(\\s5.*?\\v \d+ )/\t\t\t$1<\/preVerse>\n/g; + #Capture heading and text + #Capture chapters +#Capture verses + } + close IN; + } + +} diff --git a/Tagged_OGNT/Data/USFM_markers.txt b/Tagged_OGNT/Data/USFM_markers.txt index 04da42e..4334b6c 100644 --- a/Tagged_OGNT/Data/USFM_markers.txt +++ b/Tagged_OGNT/Data/USFM_markers.txt @@ -13,9 +13,9 @@ \fqa \fqa* \ft -\m +\m # continued paragraph from before (quote or poetry), no indent \ms # psalms section heading -\nb # follows chapter line +\nb # no break from previous paragraph; follows chapter line \p \pi # special formatting \q diff --git a/Tagged_OGNT/Tag_OGNT.pl b/Tagged_OGNT/Tag_OGNT.pl index aba42da..4541fa1 100644 --- a/Tagged_OGNT/Tag_OGNT.pl +++ b/Tagged_OGNT/Tag_OGNT.pl @@ -1,7 +1,7 @@ # Takes current tW entries and populates tagged OGNT XML # This is the current best version -# It takes care of all entries but doesn't account for USFM codes in ULB -# Trying to get it to work with repeated instances of same word. +# Requires ULB that includes USFMs. + use 5.12.0; use File::Slurp; use File::Find ; @@ -38,7 +38,7 @@ close LOG; open(LOG, ">Logs/Log.txt") or die "$!"; LongBookNames(); -Read_ULB_File(); +Prepare_ULB_file(); ProcessXML(); # put unused SN at end of verse @@ -289,7 +289,7 @@ sub FixWorkText { } return ($text) } -sub Read_ULB_File { +sub Prepare_ULB_file { $ULBText = read_file("/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Temp/ULB_text.txt", binmode => 'utf8');