Version 0.3 of the Repository conversion script: convert.py

2017-10-30 22:47:47 -04:00 · 2017-10-30 22:47:47 -04:00 · b7c68fd0d9
parent e0abb6f102
commit b7c68fd0d9
1 changed files with 83 additions and 95 deletions
--- a/convert.py
+++ b/convert.py
@@ -1,111 +1,96 @@
-#version 0.2
-#by John Wood - for Tech Advance
-# This script reads through the monolithic .md files in a DokuWiki copy of OBS and splits them out into chunks in folders
-# named like the .md files. The chunks are named 1.txt to n.txt where `n` is the last chunk. The folders and files are
-# automatically named correctly. Because the script doesn't generate a manifest.json I create a project in translationStudio
-# and then merge the folders created by this script into the translation folder created by translationStudio.
+#convert.py
+#version 0.3
+# by John Wood -- for Tech Advance
+# This script reads through the monolithic .md files in a DokuWiki copy of OBS and splits them
+# out into chunks in folders named like the .md files. The chunks are named 01.txt to nn.txt
+# where `nn` is the last chunk.
+# The folders and files are automatically named correctly. The script now creates a
+# manifest.json file as well.

 # Usage: python convert.py <path to DokuWiki OBS files>

-# Import necessary python components
-import os
-# os is used for file system commands
-import re
-# re is used for regular expressions
-import sys
-# sys is used for command line arguments
-import shutil
-# shutil is high-level file operations
+# Import the necessary Python components

-from subprocess import call
-# to fork for git
+import os       # file system commands
+import re       # regular expressions
+import sys      # command line arguments
+import shutil   # high level file operations

-# print "Starting the conversion process"
+from subprocess import call # to fork for git

-program_name=sys.argv[0];
-arguments=sys.argv[1:];
-count_args=len(arguments);
-if count_args !=1 :
-    # print "Usage: convert old_dir"
-    sys.exit(1);
-    
-convertdir = sys.argv[1];
-projectinfo = convertdir.split("_");
-oldpath=projectinfo[0];
-path=oldpath.split("/");
-language=path[len(path)-1];
-book=projectinfo[1];
-
-#projectType=projectinfo[2];
-if len(projectinfo)!=2:
-    print"This may not be a DokuWiki OBS project, and I'm not sure how to handle it";
+program_name=sys.argv[0]
+arguments=sys.argv[1:]
+count_args=len(arguments)
+if count_args !=1 : #If there is not exactly one argument, fail with a usage remark.
+    print ("convert.py script to convert DokuWiki OBS to translationStudio format")
+    print ("Usage: python converty.py <old directory>")
    sys.exit(1)
-elif book=="obs":
-    # munge the filename to get the name of the resulting folder.
-    targetpath=convertdir+"_text_obs"
    
-# print "The language is "+language;
-# print "Source directory is "+convertdir;
-# print "Target directory is "+targetpath;
+convertdir=sys.argv[1]
+projectinfo=convertdir.split("_") # splitting the argument on underscores will give us the
+                                   # information about the OBS project
+oldpath=projectinfo[0] # The first element of the argument is the path to the old project
+path=oldpath.split("/") # we can then learn more about the project from its path
+language=path[len(path)-1] # The last element in the path is the language code. We have to
+                            # subtract one because list indices start at 0, rather than 1

-# Create the manifest file
+book=projectinfo[1] # The book name, then, should be the second part of the project name

-for filename in os.listdir(convertdir+"/content/"):
-  #run the script in the target directory
-    # Check for special folders first
-    if (filename=="front"):
+if len(projectinfo)!=2: # tS projects look like lll_obs_text_obs rather than lll_obs
+    print("This may not be a DokuWiki OBS project and I'm not sure how to handle it")
+    sys.exit(1)
+    
+elif book=="obs":
+    targetpath=convertdir+"_text_obs" #this gets the target name into the right format
+    
+worksite=convertdir+"/content/"
+
+for filename in os.listdir(worksite): #the actual content is in a subdirectory
+    if(filename=="front"):
        if not os.path.exists(targetpath+"/front"):
            os.makedirs(targetpath+"/front")
-        for filename2 in os.listdir(convertdir+"/content/"+filename+"/"):
-            shutil.copyfile(convertdir+"/content/"+filename+"/"+filename2,targetpath+"/front/"+filename2.replace(".md",".txt"));
-    # Else, if the filename is a MarkDown file
-    elif filename.endswith(".md"):
-        filenum = 00;
-        # We start by making a folder/directory matching the name of the Markdown file
-        newpath = filename.replace(".md","");
-        newpath=targetpath+"/"+newpath;
+        for filename2 in os.listdir(worksite+filename+"/"):
+            shutil.copyfile(worksite+filename+"/"+filename2, targetpath+"/front/"+filename2.replace(".md",".txt"))
+            #copy the file to the new location, changing its extension to .txt
+    elif filename.endswith(".md"): # all other files we deal with are MarkDown files
+        filenum=01
+        # We start by making a folder/directory matching the name of the md file
+        newpath=filename.replace(".md","")
+        newpath=targetpath+"/"+newpath
        if not os.path.exists(newpath):
            os.makedirs(newpath)
-            
-        worksite = convertdir + "/content/";
-        filename=worksite+filename;
+        filename=worksite+filename
        
-        # Then, opening the file to scan it
-        with open(filename) as mdfile:
-            # Parsing the file by line
+        with open(filename) as mdfile: #we open the old md to scan it
            for line in mdfile:
-                # print "Working with line: "+line.strip("\s")
-                # Lines with ![Image are image references not needed in the translationStudio project
-                if re.match('\!\[Image|\[\[https',line):
-                    # print " Found a line matching ![Image"
-                    filenum = filenum + 1
-                    # print "  Filenumber is "+format(filenum)
-                elif re.match("#",line):
-                    # print "   Found a line Matching # -- this is a Title"
-                    # print "    Writing to "+newpath;
-                    myTitle = newpath + "/title.txt"
-                    with open(myTitle, "a+") as newfile:
-                        # We write the same line to the new file, but erase the hashes (#)
-                        newfile.write(line.replace("#",""));
+                if re.match("\!\[Image\]",line):
+                    #line is an image: increment the file counter
+                    newFileName = newpath + "/{:0>2d}".format(filenum)+".txt"
+                    filenum = filenum+1
+                    writeLine=""
+                elif re.match(r'\[\[https',line):
+                    #line is an image: increment the file counter
+                    newFileName = newpath + "/{:0>2d}".format(filenum)+".txt"
+                    filenum = filenum+1
+                    writeLine=""
                elif re.match("_",line):
-                    # Lines with underscores are the references
-                    myRef = newpath + "/reference.txt"
-                    with open(myRef, "a+") as newfile:
-                        #we write the same line to the new file, but erase the underscores
-                        newfile.write(line.replace("_",""));
-                # Only process lines that aren't blank
+                    #lines with underscores are the references
+                    newFileName = newpath+"/reference.txt"
+                    writeLine=line.replace("_","")
+                elif re.match("#",line):
+                    #matching title
+                    newFileName = newpath+"/title.txt"
+                    writeLine=line.replace("#","")
                elif not line.strip("\s")=="\n":
-                    # print "Line "+'"'+line.strip("\s")+'"'+" is blank"
-                #else:
-                    # print "     The current file number is "+format(filenum)
-                    if filenum==0: filenum=1
-                    newFileName = "{:0>2d}".format(filenum)+".txt";
-                    # print "      Writing the text file - "+newFileName
-                    myNewFile = newpath + "/"+newFileName
-                    with open(myNewFile, "a+") as newfile:
-                        newfile.write(line)
+                    # checking that the line isn't blank
+                    writeLine=line
+                elif line.strip("\s")=="\n":
+                    writeLine=""
+                with open(newFileName, "a+") as newfile:
+                    newfile.write(writeLine)
+
+newfile.close()

-print "Generating the manifest"
 with open(convertdir+"/manifest.yaml") as manfile:
    with open(targetpath+"/manifest.json","a+") as newmanfile:
        for manline in manfile:
@@ -113,11 +98,11 @@ with open(convertdir+"/manifest.yaml") as manfile:
            direction_match = re.search(r"    direction: (\w+)",manline)
            modified_date_match= re.search(r"  modified: '(\d\d\d\d-\d\d-\d\d)'",manline)
            if modified_date_match:
-                modified_date=modified_date_match.group(1);
+                modified_date=modified_date_match.group(1)
            elif title_match:
-                target_language_name=title_match.group(1);
+                target_language_name=title_match.group(1)
            elif direction_match:
-                target_direction = direction_match.group(1);
+                target_direction = direction_match.group(1)
                
        newmanfile.write('\n'.join([
            '{',
@@ -156,7 +141,10 @@ with open(convertdir+"/manifest.yaml") as manfile:
            '	"parent_draft": {},',
            '	"translators": [],',
            '	"finished_chunks": []',
-            '}']));
+            '}']))
            
-#gitCommand = "/usr/bin/git init "+targetpath;
-#call("/usr/bin/git init", targetpath)
+os.chdir(targetpath)
+call(["git","init"])
+call(["git","add","."])
+call(["git","commit","-m Initial commit"])
+print ("New project written in "+targetpath)