Version 0.3 of the Repository conversion script: convert.py
This commit is contained in:
parent
e0abb6f102
commit
b7c68fd0d9
178
convert.py
178
convert.py
|
@ -1,111 +1,96 @@
|
|||
#version 0.2
|
||||
#by John Wood - for Tech Advance
|
||||
# This script reads through the monolithic .md files in a DokuWiki copy of OBS and splits them out into chunks in folders
|
||||
# named like the .md files. The chunks are named 1.txt to n.txt where `n` is the last chunk. The folders and files are
|
||||
# automatically named correctly. Because the script doesn't generate a manifest.json I create a project in translationStudio
|
||||
# and then merge the folders created by this script into the translation folder created by translationStudio.
|
||||
#convert.py
|
||||
#version 0.3
|
||||
# by John Wood -- for Tech Advance
|
||||
# This script reads through the monolithic .md files in a DokuWiki copy of OBS and splits them
|
||||
# out into chunks in folders named like the .md files. The chunks are named 01.txt to nn.txt
|
||||
# where `nn` is the last chunk.
|
||||
# The folders and files are automatically named correctly. The script now creates a
|
||||
# manifest.json file as well.
|
||||
|
||||
# Usage: python convert.py <path to DokuWiki OBS files>
|
||||
|
||||
# Import necessary python components
|
||||
import os
|
||||
# os is used for file system commands
|
||||
import re
|
||||
# re is used for regular expressions
|
||||
import sys
|
||||
# sys is used for command line arguments
|
||||
import shutil
|
||||
# shutil is high-level file operations
|
||||
#Import necessary python components
|
||||
|
||||
from subprocess import call
|
||||
# to fork for git
|
||||
import os # file system commands
|
||||
import re # regular expressions
|
||||
import sys # command line arguments
|
||||
import shutil # high level file operations
|
||||
|
||||
# print "Starting the conversion process"
|
||||
from subprocess import call # to fork for git
|
||||
|
||||
program_name=sys.argv[0];
|
||||
arguments=sys.argv[1:];
|
||||
count_args=len(arguments);
|
||||
if count_args !=1 :
|
||||
# print "Usage: convert old_dir"
|
||||
sys.exit(1);
|
||||
|
||||
convertdir = sys.argv[1];
|
||||
projectinfo = convertdir.split("_");
|
||||
oldpath=projectinfo[0];
|
||||
path=oldpath.split("/");
|
||||
language=path[len(path)-1];
|
||||
book=projectinfo[1];
|
||||
|
||||
#projectType=projectinfo[2];
|
||||
if len(projectinfo)!=2:
|
||||
print"This may not be a DokuWiki OBS project, and I'm not sure how to handle it";
|
||||
program_name=sys.argv[0]
|
||||
arguments=sys.argv[1:]
|
||||
count_args=len(arguments)
|
||||
if count_args !=1 : #If there is not exactly one argument, fail with a usage remark.
|
||||
print ("convert.py script to convert DokuWiki OBS to translationStudio format")
|
||||
print ("Usage: python converty.py <old directory>")
|
||||
sys.exit(1)
|
||||
elif book=="obs":
|
||||
# munge the filename to get the name of the resulting folder.
|
||||
targetpath=convertdir+"_text_obs"
|
||||
|
||||
# print "The language is "+language;
|
||||
# print "Source directory is "+convertdir;
|
||||
# print "Target directory is "+targetpath;
|
||||
convertdir=sys.argv[1]
|
||||
projectinfo=convertdir.split("_") # splitting the argument on underscores will give us the
|
||||
# information about the OBS project
|
||||
oldpath=projectinfo[0] # The first element of the argument is the path to the old project
|
||||
path=oldpath.split("/") # we can then learn more about the project from its path
|
||||
language=path[len(path)-1] # The last element in the path is the language code. We have to
|
||||
# subtract one because the first element is 0, rather than 1
|
||||
|
||||
# Create the manifest file
|
||||
book=projectinfo[1] # The book name, then, should be the second part of the project name
|
||||
|
||||
for filename in os.listdir(convertdir+"/content/"):
|
||||
#run the script in the target directory
|
||||
# Check for special folders first
|
||||
if (filename=="front"):
|
||||
if len(projectinfo)!=2: # tS projects look like lll_obs_text_obs rather than lll_obs
|
||||
print("This may not be a DokuWiki OBS project and I'm not sure how to handle it")
|
||||
sys.exit(1)
|
||||
|
||||
elif book=="obs":
|
||||
targetpath=convertdir+"_text_obs" #this gets the target name into the right format
|
||||
|
||||
worksite=convertdir+"/content/"
|
||||
|
||||
for filename in os.listdir(worksite): #the actual content is in a subdirectory
|
||||
if(filename=="front"):
|
||||
if not os.path.exists(targetpath+"/front"):
|
||||
os.makedirs(targetpath+"/front")
|
||||
for filename2 in os.listdir(convertdir+"/content/"+filename+"/"):
|
||||
shutil.copyfile(convertdir+"/content/"+filename+"/"+filename2,targetpath+"/front/"+filename2.replace(".md",".txt"));
|
||||
# Else, if the filename is a Markdown file
|
||||
elif filename.endswith(".md"):
|
||||
filenum = 00;
|
||||
# We start by making a folder/directory matching the name of the Markdown file
|
||||
newpath = filename.replace(".md","");
|
||||
newpath=targetpath+"/"+newpath;
|
||||
for filename2 in os.listdir(worksite+filename+"/"):
|
||||
shutil.copyfile(worksite+filename+"/"+filename2, targetpath+"/front/"+filename2.replace(".md",".txt"))
|
||||
#copy the file to the new location, changing its extension to .txt
|
||||
elif filename.endswith(".md"): # all other files we deal with are Markdown files
|
||||
filenum=01
|
||||
# We start by making a folder/directory matching the name of the md file
|
||||
newpath=filename.replace(".md","")
|
||||
newpath=targetpath+"/"+newpath
|
||||
if not os.path.exists(newpath):
|
||||
os.makedirs(newpath)
|
||||
|
||||
worksite = convertdir + "/content/";
|
||||
filename=worksite+filename;
|
||||
filename=worksite+filename
|
||||
|
||||
# Then, opening the file to scan it
|
||||
with open(filename) as mdfile:
|
||||
# Parsing the file by line
|
||||
with open(filename) as mdfile: #we open the old md to scan it
|
||||
for line in mdfile:
|
||||
# print "Working with line: "+line.strip("\s")
|
||||
# Lines with ![Image are image references not needed in the translationStudio project
|
||||
if re.match('\!\[Image|\[\[https',line):
|
||||
# print " Found a line matching ![Image"
|
||||
filenum = filenum + 1
|
||||
# print " Filenumber is "+format(filenum)
|
||||
elif re.match("#",line):
|
||||
# print " Found a line Matching # -- this is a Title"
|
||||
# print " Writing to "+newpath;
|
||||
myTitle = newpath + "/title.txt"
|
||||
with open(myTitle, "a+") as newfile:
|
||||
# We write the same line to the new file, but erase the hashes (#)
|
||||
newfile.write(line.replace("#",""));
|
||||
if re.match("\!\[Image\]",line):
|
||||
#line is an image: increment the file counter
|
||||
newFileName = newpath + "/{:0>2d}".format(filenum)+".txt"
|
||||
filenum = filenum+1
|
||||
writeLine=""
|
||||
elif re.match(r'\[\[https',line):
|
||||
#line is an image: increment the file counter
|
||||
newFileName = newpath + "/{:0>2d}".format(filenum)+".txt"
|
||||
filenum = filenum+1
|
||||
writeLine=""
|
||||
elif re.match("_",line):
|
||||
# Lines with underscores are the references
|
||||
myRef = newpath + "/reference.txt"
|
||||
with open(myRef, "a+") as newfile:
|
||||
#we write the same line to the new file, but erase the underscores
|
||||
newfile.write(line.replace("_",""));
|
||||
# Only process lines that aren't blank
|
||||
#lines with underscores are the references
|
||||
newFileName = newpath+"/reference.txt"
|
||||
writeLine=line.replace("_","")
|
||||
elif re.match("#",line):
|
||||
#matching title
|
||||
newFileName = newpath+"/title.txt"
|
||||
writeLine=line.replace("#","")
|
||||
elif not line.strip("\s")=="\n":
|
||||
# print "Line "+'"'+line.strip("\s")+'"'+" is blank"
|
||||
#else:
|
||||
# print " The current file number is "+format(filenum)
|
||||
if filenum==0: filenum=1
|
||||
newFileName = "{:0>2d}".format(filenum)+".txt";
|
||||
# print " Writing the text file - "+newFileName
|
||||
myNewFile = newpath + "/"+newFileName
|
||||
with open(myNewFile, "a+") as newfile:
|
||||
newfile.write(line)
|
||||
# checking that the line isn't blank
|
||||
writeLine=line
|
||||
elif line.strip("\s")=="\n":
|
||||
writeLine=""
|
||||
with open(newFileName, "a+") as newfile:
|
||||
newfile.write(writeLine)
|
||||
|
||||
newfile.close()
|
||||
|
||||
print "Generating the manifest"
|
||||
with open(convertdir+"/manifest.yaml") as manfile:
|
||||
with open(targetpath+"/manifest.json","a+") as newmanfile:
|
||||
for manline in manfile:
|
||||
|
@ -113,11 +98,11 @@ with open(convertdir+"/manifest.yaml") as manfile:
|
|||
direction_match = re.search(r" direction: (\w+)",manline)
|
||||
modified_date_match= re.search(r" modified: '(\d\d\d\d-\d\d-\d\d)'",manline)
|
||||
if modified_date_match:
|
||||
modified_date=modified_date_match.group(1);
|
||||
modified_date=modified_date_match.group(1)
|
||||
elif title_match:
|
||||
target_language_name=title_match.group(1);
|
||||
target_language_name=title_match.group(1)
|
||||
elif direction_match:
|
||||
target_direction = direction_match.group(1);
|
||||
target_direction = direction_match.group(1)
|
||||
|
||||
newmanfile.write('\n'.join([
|
||||
'{',
|
||||
|
@ -156,7 +141,10 @@ with open(convertdir+"/manifest.yaml") as manfile:
|
|||
' "parent_draft": {},',
|
||||
' "translators": [],',
|
||||
' "finished_chunks": []',
|
||||
'}']));
|
||||
'}']))
|
||||
|
||||
#gitCommand = "/usr/bin/git init "+targetpath;
|
||||
#call("/usr/bin/git init", targetpath)
|
||||
os.chdir(targetpath)
|
||||
call(["git","init"])
|
||||
call(["git","add","."])
|
||||
call(["git","commit","-m Initial commit"])
|
||||
print ("New project written in "+targetpath)
|
Loading…
Reference in New Issue