update for MAST PDF

2020-07-07 17:39:54 -04:00 · 2020-07-07 17:39:54 -04:00 · cb1679ac8f
parent c94831a0b2
commit cb1679ac8f
6 changed files with 487 additions and 545 deletions
--- a/MAST_tW_PDF_Updater/FilesForUpdates/Exceptions/Exceptions.txt
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Exceptions/Exceptions.txt
--- a/MAST_tW_PDF_Updater/FilesForUpdates/Mine.URL.Strong.Verse.pl
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Mine.URL.Strong.Verse.pl
@ -349,13 +349,13 @@ sub Finish {
 		$tW_files .= "$key ";
 		say LOG "\$key: $key\t\$tW_file{$key}: $tW_file{$key}"
 	}
-	#say "\nOpening .md files.";
-	#if ($^O eq "darwin") {
-	#	#system `$browser https://www.blueletterbible.org/lang/lexicon/lexicon.cfm?strongs=$strong`;
-	#	system "perl $Bin/get_strongs_gist.pl";
-	#	system `$textEditor $tW_files`;
-	#	system `$textEditor $exceptions_file`;
-	#}
+	say "\nOpening .md files\n\$tW_files: $tW_files";
+	if ($^O eq "darwin") {
+		#system `$browser https://www.blueletterbible.org/lang/lexicon/lexicon.cfm?strongs=$strong`;
+		system "perl $Bin/get_strongs_gist.pl";
+		system `$textEditor $tW_files`;
+		system `$textEditor $exceptions_file`;
+	}
 	#if ($^O eq "linux") {
 	#	say "curl $intrln_ref > $Bin/Temp/This_interlinear.html";
 	#	system "curl $intrln_ref > $Bin/Temp/This_interlinear.html";
--- a/MAST_tW_PDF_Updater/FilesForUpdates/Restructure_names.pl
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Restructure_names.pl
@ -1,45 +0,0 @@
-use 5.12.0;
-use File::Slurp;
-use File::Find ; 
-use Cwd ;
-use utf8;
-#use open IN => ":utf8", OUT => ":utf8";
-use open IO => ":utf8";
-
-
-open(LOG, ">/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Logs/log.txt") or die "$!";
-
-my $topDir = "/Users/Henry/Documents/WACS/Restructure/bible/names";
-
-my @filesToRun = ();
-my $filePattern = '*.md' ; 
-find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ;
-
-foreach  my $file ( @filesToRun  )
-{
-	say LOG $file;
-	my $shortFile = $file;
-	$shortFile =~ s/^.*\/([^\/]*\.md)$/\/Users\/Henry\/Documents\/WACS\/Tips_and_Hacks\/MAST_tW_PDF_Updater\/FilesForUpdates\/Output\/names\/$1/;
-	my $fileText = read_file($file, binmode => 'utf8');
-	if ($fileText =~ /Forms Found in the English ULB/) { say LOG "\tForms Found in the English ULB"; }
-	else {
-		my ($nameLine, $mainName, $otherNames, $mainText);
-		if ($fileText =~ /^# ([^\n]*)\n(.*)$/s) {
-			($nameLine, $mainText) = ($1, $2);
-			say LOG "\$nameLine: $nameLine\n\$mainText:\n$mainText\n\n";
-			if ($nameLine =~ /^([^,]*), (.*)$/) {
-				($mainName, $otherNames) = ($1, $2);
-			} else {
-				$mainName = $nameLine
-			}
-			$fileText = "# $mainName\n\n$mainText\n\n## Forms Found in the English ULB:\n\n$nameLine";
-			$fileText =~ s/\n{3,}/\n\n/g;
-			open(OUT, ">$shortFile") or die "$!";
-			say OUT $fileText;
-			close OUT;
-		}
-	}
-	   
-} 
-
-say "Done."
--- a/MAST_tW_PDF_Updater/FilesForUpdates/Restructure_tW_pages.pl
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Restructure_tW_pages.pl
@ -1,62 +0,0 @@
-# Adds Synonyms and Related Words section and
-# Forms Found in the English ULB section
-# to tW pages
-
-use 5.12.0;
-use File::Slurp;
-use File::Find ; 
-use Cwd ; 
-use utf8;
-#use open IN => ":utf8", OUT => ":utf8";
-use open IO => ":utf8";
-
-open LOG, ">Logs/log.log";
-
-my $topDir = "/Users/Henry/Documents/WACS/W_Q_Restructure/bible";
-my $topOutDir = "/Users/Henry/Documents/WACS/W_Q_Restructure_new/bible";
-
-my @filesToRun = ();
-my $filePattern = '*.md' ; 
-find( sub { push @filesToRun, $File::Find::name  if ( m/^(.*)$filePattern$/ ) }, $topDir) ;
-
-foreach  my $file ( @filesToRun  ) {
-	say $file;
-	my $fileText = read_file("$file", binmode => 'utf8');
-	my $outText = Process($fileText);
-	Output($file, $outText);
-} 
-
-close LOG;
-
-say "Done.";
-
-# =====================
-
-sub Process {
-	my $text = $_[0];
-	my ($entries, $keyWord, $bulk, $forms);
-	if ($text =~ /^# ([^\n]*)\n/) {
-		$entries = $1;
-	}
-	if ($text =~ /^# (([^\n,]*)(\n|,))/) {
-		$keyWord = $2
-	}
-	if ($text =~ /(## (Facts|Definition):.*)$/s) {
-		$bulk = $1
-	}
-	my @forms = split /, /, $entries;
-	@forms = sort @forms;
-	$forms = join(', ', @forms);
-	$text = "# $keyWord\n\n## Synonyms and Related Words:\n\n$forms\n\n$bulk\n\n## Forms Found in the English ULB\n\n$forms\n\n\n\n";
-	while ($text =~ s/\n{3,}/\n\n/g) {}
-	#$text =~ s/\n+$/\n/;
-	return $text;
-}
-
-sub Output {
-	my ($OutFile, $text) = ($_[0], $_[1]);
-	$OutFile =~ s/$topDir/$topOutDir/;
-	open(OUT, ">$OutFile") or die $!;
-	print OUT $text;
-	close OUT
-}
--- a/MAST_tW_PDF_Updater/FilesForUpdates/User/User_defaults.mac.txt
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/User/User_defaults.mac.txt
@ -11,9 +11,9 @@ Repository directory: /Users/Henry/Documents/WACS

 translationNotes path: en_tn
 Unlocked Literal Bible path: en_ulb
-# translationNotes path: gl_.*_tn
- translationWords path: gl_.*_bible.en_tw
-# Unlocked Literal Bible path: gl_.*_ulb
+# translationNotes path: en_tn
+translationWords path: en_tw/bible
+# Unlocked Literal Bible path: en_ulb
 Hebrew Bible XML directory: MAST_HB
 Greek Bible XML directory: OGNT

--- a/MAST_tW_PDF_Updater/FilesForUpdates/tWs.from.MAST.pl
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/tWs.from.MAST.pl
@ -61,7 +61,7 @@ my (@fileList);
 # ==============================

 chdir("$pwd");
-open LOG, ">:utf8", "Logs${d}Exc_log.log" or die "\$log: Logs${d}Exc_log.log: $!";
+open LOG, ">:utf8", "Logs${d}1_Data_and_inputs.txt" or die "\$log: Logs${d}1_Data_and_inputs.txt: $!";
 open OUT, ">:utf8", $output or die "$!";
 open MISSING, ">$missing" or die "$!";

@ -79,12 +79,14 @@ GetUserDefaults();
 GetULBBooksToProcess();
 ReadExceptions();
 close LOG;
-open LOG, ">:utf8", "Logs${d}tW_pairs_log.txt" or die "Logs${d}tW_pairs_log.txt: $!";
+open LOG, ">:utf8", "Logs${d}2_tW_pairs_log.txt" or die "Logs${d}2_tW_pairs_log.txt: $!";
 PairtWEntriesTotWPageAndUniqSNs();
 close LOG;
-open LOG, ">:utf8", "Logs${d}tWs_from_MAST_log.txt" or die "tWs_from_MAST_log.txt: $!";
+open LOG, ">:utf8", "Logs${d}3_tWs_from_MAST_log.txt" or die "3_tWs_from_MAST_log.txt: $!";
 GetRelevantSNsForEachVerse();
 LinkULBtoCV();
+close LOG;
+open LOG, ">:utf8", "Logs${d}4_Process_log.txt" or die "4_Process_log.txt: $!";
 ProcessEachVerse();

 say OUT $finalOutString;
@ -123,6 +125,8 @@ sub GetUserDefaults {
 		die "No path to repo found" if $repoPath eq "";
 		
 		($topTwDir, $topOTSourceLangDir, $topNTSourceLangDir) = ("$repoPath${d}$twPath", "$repoPath${d}MAST_HB", "$repoPath${d}OGNT");
+		
+		say LOG "\$topTwDir: $topTwDir\n\$topOTSourceLangDir: $topOTSourceLangDir\n\$topNTSourceLangDir: $topNTSourceLangDir	";
 	
 	close $defaults;
 }
@ -144,12 +148,13 @@ sub GetULBBooksToProcess {
 				}
 				
 				$sourceFile = "$topSourceLangDir${d}$this_bk.xml";
+				say LOG $sourceFile;
 				push @fileList, $sourceFile;
 			}
 		}
 	
 	close $file;
-	#say LOG "\@fileList:\n@fileList";
+	say LOG "===\n\@fileList:\n@fileList\n===\n";
 }

 sub ReadExceptions {
@ -162,23 +167,24 @@ sub ReadExceptions {
 		#say LOG $line;
 		my $rf;
 		if ($line =~ /^([^#\n][^\t\n]*)\t([GH]\d+)\t\|\|$/) {
-			my ($oldNew) = ($2);
+			my ($SNtoSkip) = ($2);
 			$rf = $1;
-			#say LOG "\$line: $line, \$rf: $rf, \$oldNew: $oldNew";
-			($deleteNum{$rf}) .= "$oldNew√";
+			say LOG "<1>\t\$line: $line, \$rf: $rf, \$SNtoSkip: $SNtoSkip";
+			($deleteNum{$rf}) .= "$SNtoSkip√";
 			$specifiedText{$rf} = 1;
 			#say LOG "\$specifiedText{$rf}: $specifiedText{$rf}";
-		} elsif ($line =~ /^([^#\n][^\t\n]*)\t(\d+\t\d+)/) {
+		} elsif ($line =~ /^([^#\n][^\t\n]*)\t([GH]\d+\t[GH]\d+)/) {
 			my ($oldNew) = ($2);
 			$rf = $1;
-			#say LOG "\$line: $line, \$rf: $rf, \$oldNew: $oldNew";
+			say LOG "<2>\t\$line: $line, \$rf: $rf, \$oldNew: $oldNew";
 			($adjust{$rf}) .= "$oldNew√";
 			$specifiedText{$rf} = 1;
 		}
 		elsif ($line =~ /^([^#\n\t][^\t\n]*)\t(.\d+)\t([^\t\n]*)\t([^\t\n]*)$/) {
 			my ($rf, $sn, $snippet, $page) = ($1, $2, $3, $4);
-			#say LOG "\$line: $line, \$rf: $rf, \$oldNew: $oldNew";
+			say LOG "<3>\t\$rf: $rf,	 \$sn: $sn,	 \$snippet: $snippet,	 \$page: $page	";
 			$specifiedEntries{$rf} .= "$sn≈$snippet≈$page√";
+			$relevantSNsInCV{$rf} =~ s/$sn√?//;
 			$specifiedText{$rf} = 1;
 		}

@ -206,7 +212,7 @@ sub PairtWEntriesTotWPageAndUniqSNs {
 		if ($file =~ /\/([^\/]*)\/[^\/]*\.md/) {
 			$dir{$shortFile} = $1
 		}
-		say LOG "<0>\$shortFile: $shortFile\t\$dir{$shortFile}: $dir{$shortFile}";
+		say LOG "<4>\$shortFile: $shortFile\t\$dir{$shortFile}: $dir{$shortFile}";
 		#say "|$shortFile|"; die;
 		#if ($shortFile =~ /^(kt|names)/) {
 		#my $fileText = read_file("$file", binmode => 'utf8');
@ -297,12 +303,13 @@ sub GetRelevantSNsForEachVerse {
 		if ($sourceFile =~ /(..)-...\.xml$/) {
 			$hg = "H" if ($1 < 40);
 		}
-		#say LOG "opening \$sourceFile: $sourceFile";
+		say LOG "opening \$sourceFile: $sourceFile";
 		open IN, "$sourceFile" or die "$sourceFile can't be opened\n\n";
 			my ($thisBook, $thisChap, $thisVers, $thisCV);
 			my (@pages);
 			while (<IN>) {
 				chomp;
+				say LOG "<\@>\t$_";
 				if (/<verse osisID="([^\.]*).(\d+).(\d+)">/) {
 					#say LOG "$thisCV: \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";# Making sure previous verse is populated
 					my ($bk, $ch, $vs) = ($1, $2, $3);
@ -315,15 +322,20 @@ sub GetRelevantSNsForEachVerse {
 					#say LOG "##\t$bk $ch:$vs, $thisCV";
 				}
 				else {
-					s/(lemma=").*?(\d+).*?("\n)/$1$2$3/;
-					while (/<w lemma="(\d+)"/g) {
-						#say LOG $_;
-						my ($thisSN) = ($hg . $1);
-						#say LOG "\t\$thisSN: $thisSN, \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
-						if (exists $relevantSNs{$thisSN}) {
-							$relevantSNsInCV{$thisCV} .= "$thisSN√" unless ($relevantSNsInCV{$thisCV} =~ /\b$thisSN\b/);
+					if (/lemma="([^"]*)"/) {
+						my $gist = $1;
+						say LOG "<\@\@>\t\$gist: $gist";
+						if ($gist =~ /\d+/) {
+							s/(lemma=")[^\d]*?(\d+)[^\d]*?(")/$1$2$3/;
+							while (/<w lemma="(\d+)"/g) {
+								my ($thisSN) = ($hg . $1);
+								say LOG "<\@\@\@>\t\$thisSN: $thisSN, \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
+								if (exists $relevantSNs{$thisSN}) {
+									$relevantSNsInCV{$thisCV} .= "$thisSN√" unless ($relevantSNsInCV{$thisCV} =~ /\b$thisSN\b/);
+								}
+								say LOG ">\t\$thisSN: $thisSN, \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
+							}
 						}
-						#say LOG ">\t\$thisSN: $thisSN, \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
 					}
 				}
 			}
@ -348,16 +360,18 @@ sub ProcessEachVerse  {
 	foreach my $key (sort keys %orderRef) {
 	# for each verse
 		my ($thisCV) = ($orderRef{$key});
-		say LOG "\n<1>\n$thisCV\t$ULBtext{$thisCV}\n\$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}\t\$deleteNum{$thisCV}: $deleteNum{$thisCV}, \$specifiedEntries{$thisCV}: $specifiedEntries{$thisCV}";
+		say LOG "\n<5>\n$thisCV\t$ULBtext{$thisCV}\n\t\$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}\n\t\$deleteNum{$thisCV}: $deleteNum{$thisCV},\n\t\$specifiedEntries{$thisCV}: $specifiedEntries{$thisCV}";
+		
+		($relevantSNsInCV{$thisCV}) = DeleteSpecifiedWords ($relevantSNsInCV{$thisCV}, $specifiedEntries{$thisCV});

 		($relevantSNsInCV{$thisCV}) = DeleteObviatedSNs($relevantSNsInCV{$thisCV}, $deleteNum{$thisCV});
 		# delete obviated SNs

-		say LOG "\t\$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
+		say LOG "<6>\t\$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
 		my $processSequence = "$specifiedEntries{$thisCV}√$relevantSNsInCV{$thisCV}";
 		$processSequence =~ s/√+/√/g;
 		$processSequence =~ s/^√+//;
-		say LOG "\t\t\$processSequence: $processSequence";
+		say LOG "\t\$processSequence: $processSequence";

 		$finalOutString .= ExecuteProcessSequence($thisCV, $processSequence, $ULBtext{$thisCV});

@ -365,9 +379,22 @@ sub ProcessEachVerse  {
 	
 }

+sub DeleteSpecifiedWords {
+	my ($sns, $toDelete) = @_;
+	say LOG "<5.1>\t\$sns:\t\t$sns\n\t\t\$toDelete:\t$toDelete";
+	my @delete = split /√/, $toDelete;
+	foreach my $one (@delete) {
+		say LOG "<5.1.1>\t\$one: $one";
+		$one =~ s/^([^≈]*)≈.*$/$1/;
+		say LOG "<5.1.2>\t\$one: $one";
+		$sns =~ s/^(.*)$one(√)?(.*)$/$1$2$3/;
+	}
+	return $sns;
+}
+
 sub DeleteObviatedSNs {
 	my ($sns, $toDelete) = @_;
-	my @sns = split /√/, $sns;
+	say LOG "<5.2>\t\$sns:\t\t$sns\n\t\t\$toDelete:\t$toDelete";
 	my @delete = split /√/, $toDelete;
 	foreach my $one (@delete) {
 		$sns =~ s/^(.*)$one(√)?(.*)$/$1$2$3/;
@ -379,7 +406,7 @@ sub ExecuteProcessSequence {
 	my ($tempText, $thisCVOutString, $position, $outputFormRef) = ($trueText, "");
 	my (%snippetSequence);
 	my (@SNsequence) = split /√/, $sequence;
-	say LOG "$ref: @SNsequence";
+	say LOG "$ref:\n@SNsequence";
 	if ($ref =~ /^([^:]*) (\d+):(\d+)/) {
 		$outputFormRef = "$1,$2,$3"
 	}
@ -388,13 +415,15 @@ sub ExecuteProcessSequence {
 		# for each relevant SN in verse
 		# 	for each tW entry
 		#		if specified tW
-		say LOG "=====\n\$ref: $ref\t\$candidate: $candidate\t\$entriesThisSN{$candidate}: $entriesThisSN{$candidate}";
+		say LOG "=====\n\$ref: $ref\t\$candidate: $candidate\t\$entriesThisSN{$candidate}: $entriesThisSN{$candidate}\n$tempText";
 		my ($found, $sn, $ulbWord, $tWpage);
 		if ($candidate =~ /([^≈]*)≈([^≈]*)≈([^≈]*)/) {
 		#			get position in true text to array
 		#			delete found text from temp text
 			($sn, $ulbWord, $tWpage) = ($1,$2,$3);
+			while ($ulbWord =~ s/^(.*) \.\.\. (.*) \.\.\. (.*)/($1)\\b(.*?)\\b($2)\\b(.*?)\\b($3)/) {}
 			while ($ulbWord =~ s/^(.*) \.\.\. (.*)/($1)\\b(.*?)\\b($2)/) {}
+			say LOG "<A>\t\$ulbWord: $ulbWord";
 			if ($tempText =~ s/^(.*)\b$ulbWord\b(.*)$/$1$2/) {
 				$position = length $1;
 				$snippetSequence{$position} = "$ulbWord,$dir{$tWpage},$tWpage";
@ -416,16 +445,18 @@ sub ExecuteProcessSequence {
 		#					get ULB snippet to verse match list
 		#					get position in true text to array
 		#					delete found text from temp text
-			if ($thisEntry =~ /\(\.\*\?\)/ && $tempText =~ s/^(.*)\b($thisEntry)\b(.*)$/$1$3/i) {
-				print LOG "<2>\t\$thisEntry |$thisEntry| is found in the first test\n___";
-				$tempText =~ s/^(.*)\b$thisEntry\b(.*)$/$1$2/;
+			if ($thisEntry =~ /\(\.\*\?\)/ && $tempText =~ /$thisEntry/i) {
+				say LOG "<7>\t\$thisEntry |$thisEntry| is found in the first test";
+				if ($tempText =~ s/^(.*)\b($thisEntry)\b(.*)$/$1$3$4/i) {
+					say LOG "<7.1>\t\$1: $1	\$2: $2	\$3: $3	\$4: $4\t\$5: $5";
+				}
 				if ($trueText =~ /^(.*)\b($thisEntry)\b.*$/) {
 					$position = length $1;
 				}
 				$snippetSequence{$position} = "$thisEntry,$dir{$pagesThisEntry{$thisEntry}},$pagesThisEntry{$thisEntry}";
 					$found = 1;
 					goto Breakout;
-				}
+			}
 			elsif ($tempText =~ s/\b($thisEntry)[^\w']//i || $tempText =~ s/\b($thisEntry)["']//i || $tempText =~ s/["']($thisEntry)\b//i) {
 				say LOG "\$thisEntry |$thisEntry| is found in the second test\n---
 				";