From 044c3f5fae0f04b6b484eb83c390df6638fe2b51 Mon Sep 17 00:00:00 2001
From: Henry Whitney <henry_whitney@wycliffeassociates.org>
Date: Mon, 29 Jun 2020 15:51:43 -0500
Subject: [PATCH] work on tagged OGNT

---
 Tagged_OGNT/Build_ULB_for_Tagging.0.pl | 63 +++++++++++++++++++++
 Tagged_OGNT/Build_ULB_for_Tagging.pl   | 78 ++++++++++++++++++++++++++
 Tagged_OGNT/Data/USFM_markers.txt      |  4 +-
 Tagged_OGNT/Tag_OGNT.pl                |  8 +--
 4 files changed, 147 insertions(+), 6 deletions(-)
 create mode 100644 Tagged_OGNT/Build_ULB_for_Tagging.0.pl
 create mode 100644 Tagged_OGNT/Build_ULB_for_Tagging.pl
diff --git a/Tagged_OGNT/Build_ULB_for_Tagging.0.pl b/Tagged_OGNT/Build_ULB_for_Tagging.0.pl
new file mode 100644
index 0000000..a4cd80b
--- /dev/null
+++ b/Tagged_OGNT/Build_ULB_for_Tagging.0.pl
@@ -0,0 +1,63 @@
+use 5.18.0;
+use File::Slurp;
+use File::Find ; 
+use Cwd ; 
+use utf8;
+#use open IN => ":utf8", OUT => ":utf8";
+use open IO => ":utf8";
+
+open(LOG, '>', "Logs/Log.txt") or die "Cannot open Logs/Log.txt for writing: $!";
+open(OUT, '>', "Output/ULB.xml") or die "Cannot open Output/ULB.xml for writing: $!";
+say OUT "<xml>";
+# Both paths above are relative to the current working directory.
+my ($topDir, $outDir) = ("/Users/Henry/Documents/WACS/en_ulb", "/Users/Henry/Documents/WACS/Tips_and_Hacks/Tagged_OGNT/Output");
+# Collect every file under $topDir whose name ends with $filePattern.
+my @filesToRun = ();
+my $filePattern = '63-1JN\.usfm' ;
+my $file;	# loop variable used by ReadFiles
+find( sub { push @filesToRun, $File::Find::name  if ( m/^(.*)$filePattern$/ ) }, $topDir) ;
+
+ReadFiles();
+
+say OUT "</xml>";
+# Buffered write errors only surface at close, so check it on both write handles.
+close OUT or die "Cannot close Output/ULB.xml: $!";
+close LOG or die "Cannot close Logs/Log.txt: $!";
+say "\nDone.";
+# =====
+sub ReadFiles {
+	# Slurps each file in @filesToRun, rewrites its USFM text into XML-like markup with a chain of regexes, and writes the result plus a closing </book> to OUT.
+	foreach  $file ( @filesToRun  ) {
+		say $file;	# progress: print the path to STDOUT
+		my $fileText = read_file("$file", binmode => 'utf8');
+		#say LOG $fileText;
+		# Take the book name from the \h line before the newlines are removed below.
+		# NOTE(review): $vers is declared but never used in this sub.
+		my ($book, $chap, $vers);
+		if ($fileText =~ /\\h ([^\n]*)/) {
+			$book = $1
+		}
+		#say LOG $book;
+		$fileText =~ s/\n/ /g;	# put the whole file on one line
+		$fileText =~ s/  / /g;	# single pass: each pair of spaces becomes one, so a run of 3+ spaces is not fully collapsed
+		#say LOG $fileText;
+		$fileText =~ s/^([^\n]*?)(\\s5)/\t<book name="$book">\n\t\t<heading>$1<\/heading>$2/;	# text before the first \s5 becomes <heading>, preceded by the <book> open tag
+		$fileText =~ s/\\v/√/g;	# temporary one-character placeholder for \v so the [^√] classes below can stop at a verse marker
+		$fileText =~ s/\\s5[^√]*?\\c (\d+)/\n$&/g;	# start a new line before each "\s5 ... \c N" run that contains no verse marker
+		if ($fileText =~ s#\\s5 \\c (\d+)[^\n]*#\t\t<chapter name="$book $1">\n$&\n\t\t</chapter>#g) {$chap = $1}	# wrap each "\s5 \c N" line in <chapter>; NOTE(review): $1 after s///g is presumably from the last match, so $chap would be the final chapter number - confirm
+		$fileText =~ s/(<chapter[^>]*>\n)([^\n]*?\\c \d+) /$1\t\t\t<preVerse name="$book $chap:0">$2<\/preVerse>\n/gs;	# text right after a <chapter> tag through "\c N " becomes the preVerse named "<book> <chap>:0"
+		$fileText =~ s/(\\s5[^\n√]*)√ (\d+) /\n\t\t\t<preVerse name="$book $chap:$2">$1\\v$2<\/preVerse>\n/g;	# \s5 text leading up to a verse marker becomes a named preVerse; the marker is written back as "\v" directly followed by the number
+		say LOG $fileText;
+		$fileText =~ s/\n([^\n√]*)/\n\t\t\t\t<preVerse>$1<\/preVerse>\n/g;	# wrap whatever follows each newline (up to the next newline or verse marker) in an unnamed <preVerse>
+		#while ($fileText =~ s/(<preVerse name="([^:]*:)\d+">.*?</preVerse>\n)(\\p √ (\d+)) /$1<>/) {	}
+		# Put the real \v markers back in place of the placeholder.
+		$fileText =~ s/√/\\v/g;
+		#$fileText =~ s/(\\s5.*?\\v \d+ )/\t\t\t<preVerse>$1<\/preVerse>\n/g;
+		# TODO: capture heading and text
+		# TODO: capture chapters
+		# TODO: capture verses
+		say OUT $fileText;
+		say OUT "\t</book>";
+	} 
+	
+}
diff --git a/Tagged_OGNT/Build_ULB_for_Tagging.pl b/Tagged_OGNT/Build_ULB_for_Tagging.pl
new file mode 100644
index 0000000..c44ef18
--- /dev/null
+++ b/Tagged_OGNT/Build_ULB_for_Tagging.pl
@@ -0,0 +1,78 @@
+# second try, reading file line by line
+
+use 5.18.0;
+use File::Slurp;
+use File::Find ; 
+use Cwd ; 
+use utf8;
+#use open IN => ":utf8", OUT => ":utf8";
+use open IO => ":utf8";
+
+open(LOG, '>', "Logs/Log.txt") or die "Cannot open Logs/Log.txt for writing: $!";
+open(OUT, '>', "Output/ULB.xml") or die "Cannot open Output/ULB.xml for writing: $!";
+say OUT "<xml>";
+# Both paths above are relative to the current working directory.
+my ($topDir, $outDir) = ("/Users/Henry/Documents/WACS/en_ulb", "/Users/Henry/Documents/WACS/Tips_and_Hacks/Tagged_OGNT/Output");
+# Collect every file under $topDir whose name ends with $filePattern.
+my @filesToRun = ();
+my $filePattern = '63-1JN\.usfm' ;
+my $file;	# loop variable used by ReadFiles
+find( sub { push @filesToRun, $File::Find::name  if ( m/^(.*)$filePattern$/ ) }, $topDir) ;
+
+ReadFiles();
+
+say OUT "</xml>";
+# Buffered write errors only surface at close, so check it on both write handles.
+close OUT or die "Cannot close Output/ULB.xml: $!";
+close LOG or die "Cannot close Logs/Log.txt: $!";
+say "\nDone.";
+# =====
+sub ReadFiles {
+	# Reads each file in @filesToRun line by line, echoing every line to LOG and writing the leading header lines to OUT as one <heading> element.
+	foreach  $file ( @filesToRun  ) {
+		say $file;
+		# Three-arg open: mode characters (<, >, |) or surrounding spaces in a path cannot be misread as part of the open mode.
+		open(my $in, '<', $file) or die "Cannot open $file: $!";
+		# $heading starts empty so <heading> is well-defined even when no header line is seen; $book, $chap and $vers are not used yet.
+		my ($heading, $book, $chap, $vers, $flag) = ('');
+
+		while (<$in>) {
+			chomp;
+			say LOG $_;
+			if ($flag) {
+				die "Stopped after the heading of $file: the remaining lines are not handled yet";	# work-in-progress stop, kept from the original bare die
+			}
+			elsif (/^\\(ide?|h|toc\d|mt|cl)/) {	# header markers are accumulated, space-separated
+				$heading .= "$_ ";
+			}
+			else {	# first line that is not a header marker: emit the collected heading
+				$flag = 1;
+				say OUT "\t<heading>$heading</heading>"
+			}
+		#if ($fileText =~ /\\h ([^\n]*)/) {
+		#	$book = $1
+		#}
+		##say LOG $book;
+		#$fileText =~ s/\n/ /g;
+		#$fileText =~ s/  / /g;
+		##say LOG $fileText;
+		#$fileText =~ s/^([^\n]*?)(\\s5)/\t<book name="$book">\n\t\t<heading>$1<\/heading>$2/;
+		#$fileText =~ s/\\v/√/g;
+		#$fileText =~ s/\\s5[^√]*?\\c (\d+)/\n$&/g;
+		#if ($fileText =~ s#\\s5 \\c (\d+)[^\n]*#\t\t<chapter name="$book $1">\n$&\n\t\t</chapter>#g) {$chap = $1}
+		#$fileText =~ s/(<chapter[^>]*>\n)([^\n]*?\\c \d+) /$1\t\t\t<preVerse name="$book $chap:0">$2<\/preVerse>\n/gs;
+		#$fileText =~ s/(\\s5[^\n√]*)√ (\d+) /\n\t\t\t<preVerse name="$book $chap:$2">$1\\v$2<\/preVerse>\n/g;
+		#say LOG $fileText;
+		#$fileText =~ s/\n([^\n√]*)/\n\t\t\t\t<preVerse>$1<\/preVerse>\n/g;
+		##while ($fileText =~ s/(<preVerse name="([^:]*:)\d+">.*?</preVerse>\n)(\\p √ (\d+)) /$1<>/) {	}
+		#
+		#$fileText =~ s/√/\\v/g;
+		#$fileText =~ s/(\\s5.*?\\v \d+ )/\t\t\t<preVerse>$1<\/preVerse>\n/g;
+	#Capture heading and text
+	#Capture chapters
+#Capture verses
+		}
+		close $in;
+	} 
+	
+}
diff --git a/Tagged_OGNT/Data/USFM_markers.txt b/Tagged_OGNT/Data/USFM_markers.txt
index 04da42e..4334b6c 100644
--- a/Tagged_OGNT/Data/USFM_markers.txt
+++ b/Tagged_OGNT/Data/USFM_markers.txt
@@ -13,9 +13,9 @@
 \fqa
 \fqa*
 \ft
-\m
+\m	# continued paragraph from before (quote or poetry), no indent
 \ms	# psalms section heading
-\nb	# follows chapter line
+\nb	# no break from previous paragraph; follows chapter line
 \p
 \pi	# special formatting
 \q
diff --git a/Tagged_OGNT/Tag_OGNT.pl b/Tagged_OGNT/Tag_OGNT.pl
index aba42da..4541fa1 100644
--- a/Tagged_OGNT/Tag_OGNT.pl
+++ b/Tagged_OGNT/Tag_OGNT.pl
@@ -1,7 +1,7 @@
 # Takes current tW entries and populates tagged OGNT XML
 # This is the current best version
-#  It takes care of all entries but doesn't account for USFM codes in ULB
-# Trying to get it to work with repeated instances of same word.
+#  Requires ULB that includes USFMs.
+
 use 5.12.0;
 use File::Slurp;
 use File::Find ; 
@@ -38,7 +38,7 @@ close LOG;
 open(LOG, ">Logs/Log.txt") or die "$!";
 
 LongBookNames();
-Read_ULB_File();
+Prepare_ULB_file();
 ProcessXML();
 #	put unused SN at end of verse
 
@@ -289,7 +289,7 @@ sub FixWorkText {
 	}
 	return ($text)
 }
-sub Read_ULB_File {
+sub Prepare_ULB_file {
 
 	$ULBText = read_file("/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Temp/ULB_text.txt", binmode => 'utf8');