work on tagged OGNT

This commit is contained in:
Henry Whitney 2020-06-29 15:51:43 -05:00
parent 182dfeb169
commit 044c3f5fae
4 changed files with 147 additions and 6 deletions

View File

@ -0,0 +1,63 @@
use 5.18.0;
use File::Slurp;
use File::Find ;
use Cwd ;
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
# Driver: open the log and the XML output, collect the USFM files to convert,
# run ReadFiles() over them, then close the <xml> wrapper.
#
# LOG and OUT are deliberately kept as bareword (package) handles because
# ReadFiles() writes to them by name. Both paths are relative to the current
# working directory, so the script must be started from the directory that
# contains Logs/ and Output/.
open(LOG, '>', 'Logs/Log.txt')   or die "Cannot open Logs/Log.txt for writing: $!";
open(OUT, '>', 'Output/ULB.xml') or die "Cannot open Output/ULB.xml for writing: $!";
say OUT "<xml>";

# $outDir is not referenced by the code in this script; it is kept so that
# any later code relying on it still finds it declared.
my ($topDir, $outDir) = ("/Users/Henry/Documents/WACS/en_ulb", "/Users/Henry/Documents/WACS/Tips_and_Hacks/Tagged_OGNT/Output");
my @filesToRun = ();
my $filePattern = '63-1JN\.usfm' ;   # regex fragment matched against each file's base name
my $file;                            # loop variable used by ReadFiles()

# Inside the callback $_ is the base name and $File::Find::name the full path.
find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ;
warn "No files matching /$filePattern/ found under $topDir\n" unless @filesToRun;

ReadFiles();
say OUT "</xml>";

# Buffered write errors only surface at close time, so check both closes.
close(OUT) or die "Cannot close Output/ULB.xml: $!";
close(LOG) or die "Cannot close Logs/Log.txt: $!";
say "\nDone.";
# =====
# ReadFiles: first attempt at converting USFM to XML.
# For every path in @filesToRun the whole file is slurped into one string and
# rewritten by a fixed sequence of regex substitutions; the result is written
# to the OUT handle, followed by a closing </book> tag. An intermediate state
# is written to the LOG handle. Takes no arguments and returns nothing useful.
# The substitutions depend on one another, so their order matters.
sub ReadFiles {
foreach $file ( @filesToRun ) {
# Echo the path being processed to STDOUT.
say $file;
my $fileText = read_file("$file", binmode => 'utf8');
#say LOG $fileText;
#Delete \n
my ($book, $chap, $vers);
# The book name is whatever follows the "\h " marker up to the end of that line.
# If the file has no \h marker, $book stays undefined.
if ($fileText =~ /\\h ([^\n]*)/) {
$book = $1
}
#say LOG $book;
# Flatten the file to a single line so that newlines can be re-inserted
# below as record separators.
$fileText =~ s/\n/ /g;
# NOTE(review): as written this replaces one space with one space (a no-op);
# possibly meant to collapse double spaces -- confirm against the original file.
$fileText =~ s/ / /g;
#say LOG $fileText;
# Everything before the first \s5 becomes the <heading> inside an opening
# <book> tag. If the text contains no \s5, no <book> tag is emitted.
$fileText =~ s/^([^\n]*?)(\\s5)/\t<book name="$book">\n\t\t<heading>$1<\/heading>$2/;
# Temporarily swap the verse marker for a single character so the classes
# below can say "anything except a verse marker" with [^...].
$fileText =~ s/\\v/√/g;
# Start a new line before each "\s5 ... \c N" run that contains no verse marker.
$fileText =~ s/\\s5[^√]*?\\c (\d+)/\n$&/g;
# Wrap each line beginning with "\s5 \c N" in <chapter name="BOOK N"> ... </chapter>.
# NOTE(review): $1 is read after a global substitution, so $chap appears to end up
# as the number of the LAST chapter matched, not the current one -- confirm.
if ($fileText =~ s#\\s5 \\c (\d+)[^\n]*#\t\t<chapter name="$book $1">\n$&\n\t\t</chapter>#g) {$chap = $1}
# The text from the chapter opening through "\c N" becomes a preVerse named "BOOK CHAP:0".
$fileText =~ s/(<chapter[^>]*>\n)([^\n]*?\\c \d+) /$1\t\t\t<preVerse name="$book $chap:0">$2<\/preVerse>\n/gs;
# Each "\s5 ... <verse marker> N " run becomes its own line, wrapped in a preVerse
# named "BOOK CHAP:N"; the verse marker is written back as a literal \v here.
$fileText =~ s/(\\s5[^\n√]*)√ (\d+) /\n\t\t\t<preVerse name="$book $chap:$2">$1\\v$2<\/preVerse>\n/g;
# Log the intermediate state before the final wrapping pass.
say LOG $fileText;
# After every newline, the following run of characters up to the next newline or
# verse placeholder (possibly empty) is wrapped in an unnamed <preVerse>.
# NOTE(review): this looks like it also re-wraps lines already tagged by the
# steps above -- confirm that is intended.
$fileText =~ s/\n([^\n√]*)/\n\t\t\t\t<preVerse>$1<\/preVerse>\n/g;
#while ($fileText =~ s/(<preVerse name="([^:]*:)\d+">.*?</preVerse>\n)(\\p √ (\d+)) /$1<>/) { }
# Restore the remaining verse placeholders to the \v marker.
$fileText =~ s/√/\\v/g;
#$fileText =~ s/(\\s5.*?\\v \d+ )/\t\t\t<preVerse>$1<\/preVerse>\n/g;
#Capture heading and text
#Capture chapters
#Capture verses
# Emit the transformed text, then close the book element.
say OUT $fileText;
say OUT "\t</book>";
}
}

View File

@ -0,0 +1,78 @@
# second try, reading file line by line
use 5.18.0;
use File::Slurp;
use File::Find ;
use Cwd ;
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
# Driver: open the log and the XML output, collect the USFM files to convert,
# run ReadFiles() over them, then close the <xml> wrapper.
#
# LOG and OUT are deliberately kept as bareword (package) handles because
# ReadFiles() writes to them by name. Both paths are relative to the current
# working directory, so the script must be started from the directory that
# contains Logs/ and Output/.
open(LOG, '>', 'Logs/Log.txt')   or die "Cannot open Logs/Log.txt for writing: $!";
open(OUT, '>', 'Output/ULB.xml') or die "Cannot open Output/ULB.xml for writing: $!";
say OUT "<xml>";

# $outDir is not referenced by the code in this script; it is kept so that
# any later code relying on it still finds it declared.
my ($topDir, $outDir) = ("/Users/Henry/Documents/WACS/en_ulb", "/Users/Henry/Documents/WACS/Tips_and_Hacks/Tagged_OGNT/Output");
my @filesToRun = ();
my $filePattern = '63-1JN\.usfm' ;   # regex fragment matched against each file's base name
my $file;                            # loop variable used by ReadFiles()

# Inside the callback $_ is the base name and $File::Find::name the full path.
find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ;
warn "No files matching /$filePattern/ found under $topDir\n" unless @filesToRun;

ReadFiles();
say OUT "</xml>";

# Buffered write errors only surface at close time, so check both closes.
close(OUT) or die "Cannot close Output/ULB.xml: $!";
close(LOG) or die "Cannot close Logs/Log.txt: $!";
say "\nDone.";
# =====
# ReadFiles: second attempt, reading each file in @filesToRun line by line.
#
# Every line read is echoed to the LOG handle. Leading header lines (those
# starting with \id, \ide, \h, \tocN, \mt or \cl) are concatenated, each
# followed by a space, and written to the OUT handle as one <heading> element
# when the first non-header line is reached.
#
# Body parsing does not exist yet: the sub dies on the line that follows the
# first non-header line, exactly as the original did, but now reports which
# file and line it stopped at.
#
# Takes no arguments; reads @filesToRun and writes to the LOG and OUT handles
# opened by the caller. Dies if a file cannot be opened or closed.
sub ReadFiles {
    foreach my $file ( @filesToRun ) {
        say $file;

        # Three-argument open with a lexical handle: the file name can no
        # longer be interpreted as an open mode, and the handle cannot leak.
        # The file-level "use open" pragma still supplies the default layer.
        open(my $in, '<', $file) or die "Cannot open $file: $!";

        my $heading     = '';   # accumulated header lines, space-terminated
        my $headingDone = 0;    # true once <heading> has been written

        while (my $line = <$in>) {
            chomp $line;
            say LOG $line;
            if ($headingDone) {
                # Deliberate stop inherited from the original code.
                die "Stopped at $file line $.: parsing after the header block is not implemented";
            }
            elsif ($line =~ /^\\(ide?|h|toc\d|mt|cl)/) {
                $heading .= "$line ";
            }
            else {
                $headingDone = 1;
                say OUT "\t<heading>$heading</heading>";
            }
            # TODO: Capture heading and text
            # TODO: Capture chapters
            # TODO: Capture verses
        }

        # A file made only of header lines never reaches the else branch
        # above; emit its heading here so it is not silently dropped.
        say OUT "\t<heading>$heading</heading>" unless $headingDone;

        close($in) or die "Cannot close $file: $!";
    }
}

View File

@ -13,9 +13,9 @@
\fqa
\fqa*
\ft
\m
\m # continued paragraph from before (quote or poetry), no indent
\ms # psalms section heading
\nb # follows chapter line
\nb # no break from previous paragraph; follows chapter line
\p
\pi # special formatting
\q

View File

@ -1,7 +1,7 @@
# Takes current tW entries and populates tagged OGNT XML
# This is the current best version
# It takes care of all entries but doesn't account for USFM codes in ULB
# Trying to get it to work with repeated instances of same word.
# Requires ULB that includes USFMs.
use 5.12.0;
use File::Slurp;
use File::Find ;
@ -38,7 +38,7 @@ close LOG;
open(LOG, ">Logs/Log.txt") or die "$!";
LongBookNames();
Read_ULB_File();
Prepare_ULB_file();
ProcessXML();
# put unused SN at end of verse
@ -289,7 +289,7 @@ sub FixWorkText {
}
return ($text)
}
sub Read_ULB_File {
sub Prepare_ULB_file {
$ULBText = read_file("/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Temp/ULB_text.txt", binmode => 'utf8');