update for MAST PDF

2020-07-07 17:39:54 -04:00 · 2020-07-07 17:39:54 -04:00 · cb1679ac8f
parent c94831a0b2
commit cb1679ac8f
6 changed files with 487 additions and 545 deletions
--- a/MAST_tW_PDF_Updater/FilesForUpdates/Exceptions/Exceptions.txt
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Exceptions/Exceptions.txt
--- a/MAST_tW_PDF_Updater/FilesForUpdates/Mine.URL.Strong.Verse.pl
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Mine.URL.Strong.Verse.pl
@ -349,13 +349,13 @@ sub Finish {
 		$tW_files .= "$key ";
 		say LOG "\$key: $key\t\$tW_file{$key}: $tW_file{$key}"
 	}
-	#say "\nOpening .md files.";
+	say "\nOpening .md files\n\$tW_files: $tW_files";
-	#if ($^O eq "darwin") {
+	if ($^O eq "darwin") {
-	#	#system `$browser https://www.blueletterbible.org/lang/lexicon/lexicon.cfm?strongs=$strong`;
+		#system `$browser https://www.blueletterbible.org/lang/lexicon/lexicon.cfm?strongs=$strong`;
-	#	system "perl $Bin/get_strongs_gist.pl";
+		system "perl $Bin/get_strongs_gist.pl";
-	#	system `$textEditor $tW_files`;
+		system `$textEditor $tW_files`;
-	#	system `$textEditor $exceptions_file`;
+		system `$textEditor $exceptions_file`;
-	#}
+	}
 	#if ($^O eq "linux") {
 	#	say "curl $intrln_ref > $Bin/Temp/This_interlinear.html";
 	#	system "curl $intrln_ref > $Bin/Temp/This_interlinear.html";
--- a/MAST_tW_PDF_Updater/FilesForUpdates/Restructure_names.pl
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Restructure_names.pl
@ -1,45 +0,0 @@
 use 5.12.0;
 use File::Slurp;
 use File::Find ; 
 use Cwd ;
 use utf8;
 #use open IN => ":utf8", OUT => ":utf8";
 use open IO => ":utf8";
 # Restructures every "names" page under $topDir that does not yet contain a
 # "Forms Found in the English ULB" section: the page heading is reduced to the
 # first name on the heading line, and the full heading line is appended as a
 # new "Forms Found in the English ULB" section. Results go to $outDir under
 # the same base file name; progress is written to the log file.
 my $logPath = "/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Logs/log.txt";
 open(LOG, '>', $logPath) or die "$logPath: $!";
 my $topDir = "/Users/Henry/Documents/WACS/Restructure/bible/names";
 my $outDir = "/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Output/names";
 my @filesToRun = ();
 # A real regex rather than the glob '*.md': interpolating the glob produced
 # /^(.*)*.md$/, whose unescaped dot also accepted names such as "x.cmd".
 my $filePattern = qr/\.md\z/;
 find( sub { push @filesToRun, $File::Find::name if /$filePattern/ }, $topDir );
 foreach my $file (@filesToRun) {
 	say LOG $file;
 	# Output file keeps the base name of the input file.
 	my ($baseName) = $file =~ m{([^/]*\.md)\z};
 	my $shortFile = "$outDir/$baseName";
 	my $fileText = read_file($file, binmode => 'utf8');
 	# Pages that already have the section are left alone.
 	if ($fileText =~ /Forms Found in the English ULB/) {
 		say LOG "\tForms Found in the English ULB";
 		next;
 	}
 	# Only pages that start with a "# heading" line are rewritten.
 	next unless $fileText =~ /^# ([^\n]*)\n(.*)$/s;
 	my ($nameLine, $mainText) = ($1, $2);
 	say LOG "\$nameLine: $nameLine\n\$mainText:\n$mainText\n\n";
 	# Main name is the text before the first ", "; otherwise the whole line.
 	my ($mainName) = $nameLine =~ /^([^,]*), /;
 	$mainName = $nameLine unless defined $mainName;
 	$fileText = "# $mainName\n\n$mainText\n\n## Forms Found in the English ULB:\n\n$nameLine";
 	$fileText =~ s/\n{3,}/\n\n/g;
 	open(my $out, '>', $shortFile) or die "$shortFile: $!";
 	say {$out} $fileText;
 	# Buffered write errors only surface at close, so check it.
 	close($out) or die "$shortFile: $!";
 }
 close LOG;
 say "Done.";
--- a/MAST_tW_PDF_Updater/FilesForUpdates/Restructure_tW_pages.pl
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Restructure_tW_pages.pl
@ -1,62 +0,0 @@
 # Adds Synonyms and Related Words section and
 # Forms Found in the English ULB section
 # to tW pages
 use 5.12.0;
 use File::Slurp;
 use File::Find ; 
 use Cwd ; 
 use utf8;
 #use open IN => ":utf8", OUT => ":utf8";
 use open IO => ":utf8";
 # Main driver: collect every .md page under $topDir, rebuild its text with
 # Process(), and write the result to the mirrored path under $topOutDir via
 # Output().
 # The log is not written to in this part of the script, so a failure to open
 # it is reported but does not stop the run.
 open(LOG, '>', "Logs/log.log") or warn "Logs/log.log: $!";
 my $topDir = "/Users/Henry/Documents/WACS/W_Q_Restructure/bible";
 my $topOutDir = "/Users/Henry/Documents/WACS/W_Q_Restructure_new/bible";
 my @filesToRun = ();
 # A real regex rather than the glob '*.md': interpolating the glob produced
 # /^(.*)*.md$/, whose unescaped dot also accepted names such as "x.cmd".
 my $filePattern = qr/\.md\z/;
 find( sub { push @filesToRun, $File::Find::name if /$filePattern/ }, $topDir );
 foreach my $file (@filesToRun) {
 	say $file;
 	my $fileText = read_file($file, binmode => 'utf8');
 	my $outText = Process($fileText);
 	Output($file, $outText);
 }
 close LOG;
 say "Done.";
 # =====================
 sub Process {
 	# Rebuilds the text of one tW page.
 	#   $_[0]   - full text of the page
 	#   returns - "# <first entry>", a "Synonyms and Related Words" section,
 	#             the original text from "## Facts:" / "## Definition:" onward,
 	#             and a "Forms Found in the English ULB" section; both added
 	#             sections list the heading entries sorted and joined by ", ".
 	my ($page) = @_;
 	# Everything on the heading line after "# ".
 	my ($headingLine) = $page =~ /^# ([^\n]*)\n/;
 	# First entry of the heading: text up to the first comma or newline.
 	my (undef, $firstEntry) = $page =~ /^# (([^\n,]*)(\n|,))/;
 	# Body of the page, starting at its Facts or Definition section.
 	my ($body) = $page =~ /(## (Facts|Definition):.*)$/s;
 	my $sortedForms = join(', ', sort { $a cmp $b } split(/, /, $headingLine));
 	my $rebuilt = "# $firstEntry\n\n## Synonyms and Related Words:\n\n$sortedForms\n\n$body\n\n## Forms Found in the English ULB\n\n$sortedForms\n\n\n\n";
 	# Collapse any run of three or more newlines into a single blank line.
 	$rebuilt =~ s/\n{3,}/\n\n/g;
 	return $rebuilt;
 }
 sub Output {
 	# Writes $text to the output counterpart of an input file.
 	#   $_[0] - path of the input file (expected to contain $topDir)
 	#   $_[1] - text to write
 	# The output path is the input path with $topDir replaced by $topOutDir.
 	# Dies with the path and OS error if the file cannot be opened or closed.
 	my ($OutFile, $text) = @_;
 	# \Q...\E: the directory is literal text, not a regex.
 	$OutFile =~ s/\Q$topDir\E/$topOutDir/;
 	open(my $out, '>', $OutFile) or die "$OutFile: $!";
 	print {$out} $text;
 	# Buffered write errors only surface at close, so check it.
 	close($out) or die "$OutFile: $!";
 	return;
 }
--- a/MAST_tW_PDF_Updater/FilesForUpdates/User/User_defaults.mac.txt
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/User/User_defaults.mac.txt
@ -11,9 +11,9 @@ Repository directory: /Users/Henry/Documents/WACS
 translationNotes path: en_tn
 Unlocked Literal Bible path: en_ulb
-# translationNotes path: gl_.*_tn
+# translationNotes path: en_tn
- translationWords path: gl_.*_bible.en_tw
+translationWords path: en_tw/bible
-# Unlocked Literal Bible path: gl_.*_ulb
+# Unlocked Literal Bible path: en_ulb
 Hebrew Bible XML directory: MAST_HB
 Greek Bible XML directory: OGNT
--- a/MAST_tW_PDF_Updater/FilesForUpdates/tWs.from.MAST.pl
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/tWs.from.MAST.pl
@ -61,7 +61,7 @@ my (@fileList);
 # ==============================
 chdir("$pwd");
-open LOG, ">:utf8", "Logs${d}Exc_log.log" or die "\$log: Logs${d}Exc_log.log: $!";
+open LOG, ">:utf8", "Logs${d}1_Data_and_inputs.txt" or die "\$log: Logs${d}1_Data_and_inputs.txt: $!";
 open OUT, ">:utf8", $output or die "$!";
 open MISSING, ">$missing" or die "$!";
@ -79,12 +79,14 @@ GetUserDefaults();
 GetULBBooksToProcess();
 ReadExceptions();
 close LOG;
-open LOG, ">:utf8", "Logs${d}tW_pairs_log.txt" or die "Logs${d}tW_pairs_log.txt: $!";
+open LOG, ">:utf8", "Logs${d}2_tW_pairs_log.txt" or die "Logs${d}2_tW_pairs_log.txt: $!";
 PairtWEntriesTotWPageAndUniqSNs();
 close LOG;
-open LOG, ">:utf8", "Logs${d}tWs_from_MAST_log.txt" or die "tWs_from_MAST_log.txt: $!";
+open LOG, ">:utf8", "Logs${d}3_tWs_from_MAST_log.txt" or die "3_tWs_from_MAST_log.txt: $!";
 GetRelevantSNsForEachVerse();
 LinkULBtoCV();
 close LOG;
 open LOG, ">:utf8", "Logs${d}4_Process_log.txt" or die "4_Process_log.txt: $!";
 ProcessEachVerse();
 say OUT $finalOutString;
@ -123,6 +125,8 @@ sub GetUserDefaults {
 		die "No path to repo found" if $repoPath eq "";
 		($topTwDir, $topOTSourceLangDir, $topNTSourceLangDir) = ("$repoPath${d}$twPath", "$repoPath${d}MAST_HB", "$repoPath${d}OGNT");
 		say LOG "\$topTwDir: $topTwDir\n\$topOTSourceLangDir: $topOTSourceLangDir\n\$topNTSourceLangDir: $topNTSourceLangDir	";
 	close $defaults;
 }
@ -144,12 +148,13 @@ sub GetULBBooksToProcess {
 				}
 				$sourceFile = "$topSourceLangDir${d}$this_bk.xml";
 				say LOG $sourceFile;
 				push @fileList, $sourceFile;
 			}
 		}
 	close $file;
-	#say LOG "\@fileList:\n@fileList";
+	say LOG "===\n\@fileList:\n@fileList\n===\n";
 }
 sub ReadExceptions {
@ -162,23 +167,24 @@ sub ReadExceptions {
 		#say LOG $line;
 		my $rf;
 		if ($line =~ /^([^#\n][^\t\n]*)\t([GH]\d+)\t\|\|$/) {
-			my ($oldNew) = ($2);
+			my ($SNtoSkip) = ($2);
 			$rf = $1;
-			#say LOG "\$line: $line, \$rf: $rf, \$oldNew: $oldNew";
+			say LOG "<1>\t\$line: $line, \$rf: $rf, \$SNtoSkip: $SNtoSkip";
-			($deleteNum{$rf}) .= "$oldNew√";
+			($deleteNum{$rf}) .= "$SNtoSkip√";
 			$specifiedText{$rf} = 1;
 			#say LOG "\$specifiedText{$rf}: $specifiedText{$rf}";
-		} elsif ($line =~ /^([^#\n][^\t\n]*)\t(\d+\t\d+)/) {
+		} elsif ($line =~ /^([^#\n][^\t\n]*)\t([GH]\d+\t[GH]\d+)/) {
 			my ($oldNew) = ($2);
 			$rf = $1;
-			#say LOG "\$line: $line, \$rf: $rf, \$oldNew: $oldNew";
+			say LOG "<2>\t\$line: $line, \$rf: $rf, \$oldNew: $oldNew";
 			($adjust{$rf}) .= "$oldNew√";
 			$specifiedText{$rf} = 1;
 		}
 		elsif ($line =~ /^([^#\n\t][^\t\n]*)\t(.\d+)\t([^\t\n]*)\t([^\t\n]*)$/) {
 			my ($rf, $sn, $snippet, $page) = ($1, $2, $3, $4);
-			#say LOG "\$line: $line, \$rf: $rf, \$oldNew: $oldNew";
+			say LOG "<3>\t\$rf: $rf,	 \$sn: $sn,	 \$snippet: $snippet,	 \$page: $page	";
 			$specifiedEntries{$rf} .= "$sn≈$snippet≈$page√";
 			$relevantSNsInCV{$rf} =~ s/$sn√?//;
 			$specifiedText{$rf} = 1;
 		}
@ -206,7 +212,7 @@ sub PairtWEntriesTotWPageAndUniqSNs {
 		if ($file =~ /\/([^\/]*)\/[^\/]*\.md/) {
 			$dir{$shortFile} = $1
 		}
-		say LOG "<0>\$shortFile: $shortFile\t\$dir{$shortFile}: $dir{$shortFile}";
+		say LOG "<4>\$shortFile: $shortFile\t\$dir{$shortFile}: $dir{$shortFile}";
 		#say "|$shortFile|"; die;
 		#if ($shortFile =~ /^(kt|names)/) {
 		#my $fileText = read_file("$file", binmode => 'utf8');
@ -297,12 +303,13 @@ sub GetRelevantSNsForEachVerse {
 		if ($sourceFile =~ /(..)-...\.xml$/) {
 			$hg = "H" if ($1 < 40);
 		}
-		#say LOG "opening \$sourceFile: $sourceFile";
+		say LOG "opening \$sourceFile: $sourceFile";
 		open IN, "$sourceFile" or die "$sourceFile can't be opened\n\n";
 			my ($thisBook, $thisChap, $thisVers, $thisCV);
 			my (@pages);
 			while (<IN>) {
 				chomp;
 				say LOG "<\@>\t$_";
 				if (/<verse osisID="([^\.]*).(\d+).(\d+)">/) {
 					#say LOG "$thisCV: \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";# Making sure previous verse is populated
 					my ($bk, $ch, $vs) = ($1, $2, $3);
@ -315,15 +322,20 @@ sub GetRelevantSNsForEachVerse {
 					#say LOG "##\t$bk $ch:$vs, $thisCV";
 				}
 				else {
-					s/(lemma=").*?(\d+).*?("\n)/$1$2$3/;
+					if (/lemma="([^"]*)"/) {
-					while (/<w lemma="(\d+)"/g) {
+						my $gist = $1;
-						#say LOG $_;
+						say LOG "<\@\@>\t\$gist: $gist";
-						my ($thisSN) = ($hg . $1);
+						if ($gist =~ /\d+/) {
-						#say LOG "\t\$thisSN: $thisSN, \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
+							s/(lemma=")[^\d]*?(\d+)[^\d]*?(")/$1$2$3/;
-						if (exists $relevantSNs{$thisSN}) {
+							while (/<w lemma="(\d+)"/g) {
-							$relevantSNsInCV{$thisCV} .= "$thisSN√" unless ($relevantSNsInCV{$thisCV} =~ /\b$thisSN\b/);
+								my ($thisSN) = ($hg . $1);
 								say LOG "<\@\@\@>\t\$thisSN: $thisSN, \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
 								if (exists $relevantSNs{$thisSN}) {
 									$relevantSNsInCV{$thisCV} .= "$thisSN√" unless ($relevantSNsInCV{$thisCV} =~ /\b$thisSN\b/);
 								}
 								say LOG ">\t\$thisSN: $thisSN, \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
 							}
 						}
 						#say LOG ">\t\$thisSN: $thisSN, \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
 					}
 				}
 			}
@ -348,16 +360,18 @@ sub ProcessEachVerse  {
 	foreach my $key (sort keys %orderRef) {
 	# for each verse
 		my ($thisCV) = ($orderRef{$key});
-		say LOG "\n<1>\n$thisCV\t$ULBtext{$thisCV}\n\$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}\t\$deleteNum{$thisCV}: $deleteNum{$thisCV}, \$specifiedEntries{$thisCV}: $specifiedEntries{$thisCV}";
+		say LOG "\n<5>\n$thisCV\t$ULBtext{$thisCV}\n\t\$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}\n\t\$deleteNum{$thisCV}: $deleteNum{$thisCV},\n\t\$specifiedEntries{$thisCV}: $specifiedEntries{$thisCV}";
 		($relevantSNsInCV{$thisCV}) = DeleteSpecifiedWords ($relevantSNsInCV{$thisCV}, $specifiedEntries{$thisCV});
 		($relevantSNsInCV{$thisCV}) = DeleteObviatedSNs($relevantSNsInCV{$thisCV}, $deleteNum{$thisCV});
 		# delete obviated SNs
-		say LOG "\t\$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
+		say LOG "<6>\t\$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
 		my $processSequence = "$specifiedEntries{$thisCV}√$relevantSNsInCV{$thisCV}";
 		$processSequence =~ s/√+/√/g;
 		$processSequence =~ s/^√+//;
-		say LOG "\t\t\$processSequence: $processSequence";
+		say LOG "\t\$processSequence: $processSequence";
 		$finalOutString .= ExecuteProcessSequence($thisCV, $processSequence, $ULBtext{$thisCV});
@ -365,9 +379,22 @@ sub ProcessEachVerse  {
 }
 sub DeleteSpecifiedWords {
 	# Removes from a √-separated list of SNs every SN named in $toDelete.
 	#   $sns      - √-separated SN list, e.g. "H1√H2√"
 	#   $toDelete - √-separated entries; the SN of each entry is the text
 	#               before its first ≈ (may be undefined or empty)
 	#   returns   - $sns with each named SN removed; the √ separators are left
 	#               in place, as before.
 	# Only a whole √-delimited item is removed, so deleting "H12" no longer
 	# cuts into "H123". The SN is matched literally (\Q...\E).
 	my ($sns, $toDelete) = @_;
 	say LOG "<5.1>\t\$sns:\t\t$sns\n\t\t\$toDelete:\t$toDelete";
 	foreach my $entry (split /√/, $toDelete // '') {
 		say LOG "<5.1.1>\t\$one: $entry";
 		my ($one) = $entry =~ /^([^≈]*)/;
 		say LOG "<5.1.2>\t\$one: $one";
 		next if $one eq '';
 		# Greedy prefix keeps the original "last occurrence" choice; the SN must
 		# start at the beginning or after a √ and end at a √ or the end.
 		$sns =~ s/^(.*(?:^|√))\Q$one\E(?=√|\z)/$1/;
 	}
 	return $sns;
 }
 sub DeleteObviatedSNs {
 	my ($sns, $toDelete) = @_;
-	my @sns = split /√/, $sns;
+	say LOG "<5.2>\t\$sns:\t\t$sns\n\t\t\$toDelete:\t$toDelete";
 	my @delete = split /√/, $toDelete;
 	foreach my $one (@delete) {
 		$sns =~ s/^(.*)$one(√)?(.*)$/$1$2$3/;
@ -379,7 +406,7 @@ sub ExecuteProcessSequence {
 	my ($tempText, $thisCVOutString, $position, $outputFormRef) = ($trueText, "");
 	my (%snippetSequence);
 	my (@SNsequence) = split /√/, $sequence;
-	say LOG "$ref: @SNsequence";
+	say LOG "$ref:\n@SNsequence";
 	if ($ref =~ /^([^:]*) (\d+):(\d+)/) {
 		$outputFormRef = "$1,$2,$3"
 	}
@ -388,13 +415,15 @@ sub ExecuteProcessSequence {
 		# for each relevant SN in verse
 		# 	for each tW entry
 		#		if specified tW
-		say LOG "=====\n\$ref: $ref\t\$candidate: $candidate\t\$entriesThisSN{$candidate}: $entriesThisSN{$candidate}";
+		say LOG "=====\n\$ref: $ref\t\$candidate: $candidate\t\$entriesThisSN{$candidate}: $entriesThisSN{$candidate}\n$tempText";
 		my ($found, $sn, $ulbWord, $tWpage);
 		if ($candidate =~ /([^≈]*)≈([^≈]*)≈([^≈]*)/) {
 		#			get position in true text to array
 		#			delete found text from temp text
 			($sn, $ulbWord, $tWpage) = ($1,$2,$3);
 			while ($ulbWord =~ s/^(.*) \.\.\. (.*) \.\.\. (.*)/($1)\\b(.*?)\\b($2)\\b(.*?)\\b($3)/) {}
 			while ($ulbWord =~ s/^(.*) \.\.\. (.*)/($1)\\b(.*?)\\b($2)/) {}
 			say LOG "<A>\t\$ulbWord: $ulbWord";
 			if ($tempText =~ s/^(.*)\b$ulbWord\b(.*)$/$1$2/) {
 				$position = length $1;
 				$snippetSequence{$position} = "$ulbWord,$dir{$tWpage},$tWpage";
@ -416,16 +445,18 @@ sub ExecuteProcessSequence {
 		#					get ULB snippet to verse match list
 		#					get position in true text to array
 		#					delete found text from temp text
-			if ($thisEntry =~ /\(\.\*\?\)/ && $tempText =~ s/^(.*)\b($thisEntry)\b(.*)$/$1$3/i) {
+			if ($thisEntry =~ /\(\.\*\?\)/ && $tempText =~ /$thisEntry/i) {
-				print LOG "<2>\t\$thisEntry |$thisEntry| is found in the first test\n___";
+				say LOG "<7>\t\$thisEntry |$thisEntry| is found in the first test";
-				$tempText =~ s/^(.*)\b$thisEntry\b(.*)$/$1$2/;
+				if ($tempText =~ s/^(.*)\b($thisEntry)\b(.*)$/$1$3$4/i) {
 					say LOG "<7.1>\t\$1: $1	\$2: $2	\$3: $3	\$4: $4\t\$5: $5";
 				}
 				if ($trueText =~ /^(.*)\b($thisEntry)\b.*$/) {
 					$position = length $1;
 				}
 				$snippetSequence{$position} = "$thisEntry,$dir{$pagesThisEntry{$thisEntry}},$pagesThisEntry{$thisEntry}";
 					$found = 1;
 					goto Breakout;
-				}
+			}
 			elsif ($tempText =~ s/\b($thisEntry)[^\w']//i || $tempText =~ s/\b($thisEntry)["']//i || $tempText =~ s/["']($thisEntry)\b//i) {
 				say LOG "\$thisEntry |$thisEntry| is found in the second test\n---
 				";