TypeScript/scripts/word2md.ts

374 lines
11 KiB
TypeScript
Raw Normal View History

// word2md - Word to Markdown conversion tool
//
// word2md converts a Microsoft Word document to Markdown formatted text. The tool uses the
// Word Automation APIs to start an instance of Word and access the contents of the document
// being converted. The tool must be run using the cscript.exe script host and requires Word
// to be installed on the target machine. The name of the document to convert must be specified
// as a command line argument and the resulting Markdown is written to standard output. The
// tool recognizes the specific Word styles used in the TypeScript Language Specification.
/**
 * Type declarations for the part of the Word Automation object model that this
 * script touches. These are compile-time shapes only; the actual objects are
 * supplied by Word at run time.
 */
module Word {
    /**
     * A counted collection with an indexed accessor.
     * NOTE(review): the converter reads the first table as `item(1)`, so indexes
     * appear to start at 1 rather than 0 - confirm against the Word documentation.
     */
    export interface Collection<T> {
        count: number;
        item(index: number): T;
    }

    /** Character formatting flags used as find/replace criteria. */
    export interface Font {
        bold: boolean;
        italic: boolean;
        subscript: boolean;
        superscript: boolean;
    }

    /** Search criteria for a range, plus the operation that runs the search. */
    export interface Find {
        font: Font;
        format: boolean;
        replacement: Replacement;
        /** Either a style name or a numeric built-in style identifier. */
        style: any;
        text: string;
        /** Resets any formatting criteria previously set on this object. */
        clearFormatting(): void;
        /**
         * Runs the search (and optional replacement) described by this object
         * and the arguments. `wrap` and `replace` take numeric codes.
         */
        execute(
            findText: string,
            matchCase: boolean,
            matchWholeWord: boolean,
            matchWildcards: boolean,
            matchSoundsLike: boolean,
            matchAllWordForms: boolean,
            forward: boolean,
            wrap: number,
            format: boolean,
            replaceWith: string,
            replace: number): boolean;
    }

    /** Formatting and text applied to matches when replacing. */
    export interface Replacement {
        font: Font;
        /** Either a style name or a numeric built-in style identifier. */
        style: any;
        text: string;
        /** Resets any replacement formatting previously set on this object. */
        clearFormatting(): void;
    }

    /** List numbering information for a range. */
    export interface ListFormat {
        listLevelNumber: number;
        listString: string;
    }

    /** A table column; no members are needed by this script. */
    export interface Column {
    }

    export interface Columns extends Collection<Column> {
    }

    export interface Table {
        columns: Columns;
    }

    export interface Tables extends Collection<Table> {
    }

    /** A contiguous span of document content. */
    export interface Range {
        find: Find;
        listFormat: ListFormat;
        tables: Tables;
        text: string;
        words: Ranges;
    }

    export interface Ranges extends Collection<Range> {
    }

    export interface Style {
        nameLocal: string;
    }

    export interface Paragraph {
        /** Numeric alignment code; the converter treats 1 as centered and 2 as right-aligned. */
        alignment: number;
        range: Range;
        style: Style;
        /** Returns the following paragraph; the converter stops iterating on a falsy result. */
        next(): Paragraph;
    }

    export interface Paragraphs extends Collection<Paragraph> {
        first: Paragraph;
    }

    /** A document field; no members are needed by this script. */
    export interface Field {
    }

    export interface Fields extends Collection<Field> {
        /** Switches the fields between showing their codes and showing their results. */
        toggleShowCodes(): void;
    }

    export interface Document {
        fields: Fields;
        paragraphs: Paragraphs;
        close(saveChanges: boolean): void;
        /** Returns a range; the converter uses it as the scope for document-wide find/replace. */
        range(): Range;
    }

    export interface Documents extends Collection<Document> {
        open(filename: string): Document;
    }

    export interface Application {
        documents: Documents;
        quit(): void;
    }
}
// Small facade over the script host: the command-line arguments as a plain
// array, a factory for automation objects, and a writer for standard output.
var sys = (function () {
    // Copy the host's argument collection into an ordinary string array.
    var args: string[] = [];
    var index = 0;
    while (index < WScript.Arguments.length) {
        args.push(WScript.Arguments.Item(index));
        index++;
    }

    // Instantiates the automation object registered under the given type name.
    function createObject(typeName: string) {
        return new ActiveXObject(typeName);
    }

    // Writes the string to the host's standard output stream.
    function write(s: string) {
        return WScript.StdOut.Write(s);
    }

    return {
        args: args,
        createObject: createObject,
        write: write
    };
})();
/**
 * Converts an open Word document to Markdown text.
 *
 * The conversion runs in two phases. First, character-level formatting
 * (subscripts, the "Code Fragment", "Production" and "Terminal" styles,
 * bold-italic, italic, and REF fields) is rewritten inside the document itself
 * with find-and-replace so that the Markdown markers become part of the text.
 * Then every paragraph is visited in order and emitted according to its style.
 *
 * The first phase edits the document, so a caller that does not want those
 * edits persisted must close the document without saving.
 *
 * @param doc The document to convert.
 * @returns The Markdown rendering of the document.
 */
function convertDocumentToMarkdown(doc: Word.Document): string {
    var result: string = "";
    var lastStyle: string;
    var lastInTable: boolean;
    var tableColumnCount: number;
    var tableCellIndex: number;
    var columnAlignment: number[] = [];

    // Runs a forward, format-sensitive replace-all using the criteria already
    // configured on `find`. The numeric arguments 0 and 2 are the wrap and
    // replace codes (wdFindStop and wdReplaceAll in Word's enumerations).
    function replaceAllMatches(find: Word.Find, findText: string, replaceWith: string) {
        find.execute(findText, false, false, false, false, false, true, 0, true, replaceWith, 2);
    }

    // Wraps subscripted text in <sub> tags and clears the subscript flag.
    function reformatSubscripts() {
        var find = doc.range().find;
        find.clearFormatting();
        find.font.subscript = true;
        var replace = find.replacement;
        replace.clearFormatting();
        replace.font.subscript = false;
        replaceAllMatches(find, "", "<sub>^&</sub>");
    }

    // Wraps "Code Fragment" styled text in backticks.
    function reformatCodeFragments() {
        var find = doc.range().find;
        find.clearFormatting();
        find.style = "Code Fragment";
        var replace = find.replacement;
        replace.clearFormatting();
        replace.style = -66; // Default Paragraph Font
        replaceAllMatches(find, "", "`^&`");
    }

    // Wraps "Production" styled text in single asterisks.
    function reformatProductions() {
        var find = doc.range().find;
        find.clearFormatting();
        find.style = "Production";
        var replace = find.replacement;
        replace.clearFormatting();
        replace.style = -66; // Default Paragraph Font
        replaceAllMatches(find, "", "*^&*");
    }

    // Wraps "Terminal" styled text in backticks.
    function reformatTerminals() {
        var find = doc.range().find;
        find.clearFormatting();
        find.style = "Terminal";
        var replace = find.replacement;
        replace.clearFormatting();
        replace.style = -66; // Default Paragraph Font
        replaceAllMatches(find, "", "`^&`");
    }

    // Wraps bold-italic text in triple asterisks. This must run before
    // reformatItalic so the text is no longer italic when that pass searches.
    function reformatBoldItalic() {
        var find = doc.range().find;
        find.clearFormatting();
        find.font.bold = true;
        find.font.italic = true;
        var replace = find.replacement;
        replace.clearFormatting();
        replace.font.bold = false;
        replace.font.italic = false;
        replaceAllMatches(find, "", "***^&***");
    }

    // Wraps italic text in single asterisks.
    function reformatItalic() {
        var find = doc.range().find;
        find.clearFormatting();
        find.font.italic = true;
        var replace = find.replacement;
        replace.clearFormatting();
        replace.font.italic = false;
        replaceAllMatches(find, "", "*^&*");
    }

    // Turns REF fields into Markdown links. Field codes are shown for the
    // duration of the search and toggled back afterwards.
    // NOTE(review): "^19" presumably matches the field-start character - confirm.
    function reformatReferences() {
        doc.fields.toggleShowCodes();
        var find = doc.range().find;
        find.clearFormatting();
        var replace = find.replacement;
        replace.clearFormatting();
        replaceAllMatches(find, "^19 REF", "[^&](#^&)");
        doc.fields.toggleShowCodes();
    }

    // Appends text to the Markdown output.
    function write(s: string) {
        result += s;
    }

    // Returns a string of `width` spaces (empty when width is zero or negative).
    // Built with a loop because the script host's JScript engine predates
    // String.prototype.repeat.
    function indent(width: number) {
        var spaces = "";
        for (var i = 0; i < width; i++) {
            spaces += " ";
        }
        return spaces;
    }

    // Emits the Markdown header separator row using the alignment recorded for
    // each cell of the first table row. tableColumnCount is one more than the
    // number of columns, hence the "- 1".
    function writeTableHeader() {
        for (var i = 0; i < tableColumnCount - 1; i++) {
            switch (columnAlignment[i]) {
                case 1:
                    write("|:---:");
                    break;
                case 2:
                    write("|---:");
                    break;
                default:
                    write("|---");
            }
        }
        write("|\n");
    }

    // Removes trailing control characters (code points below 0x20).
    function stripFormattingMarks(text: string) {
        var i = text.length;
        while (i > 0 && text.charCodeAt(i - 1) < 0x20) i--;
        return text.substr(0, i);
    }

    // Closes the block that the previously written paragraph belonged to.
    function writeBlockEnd() {
        switch (lastStyle) {
            case "Code":
                write("```\n\n");
                break;
            case "List Paragraph":
            case "Table":
            case "TOC":
                write("\n");
                break;
        }
    }

    // Emits one paragraph according to its (normalized) style name.
    function writeParagraph(p: Word.Paragraph) {
        var text = p.range.text;
        var style = p.style.nameLocal;
        var inTable = p.range.tables.count > 0;
        var level = 1;
        // Detect a form feed before trailing control characters are stripped.
        var sectionBreak = text.indexOf("\x0C") >= 0;
        text = stripFormattingMarks(text);
        if (inTable) {
            style = "Table";
        }
        else if (style.match(/\s\d$/)) {
            // A style such as "Heading 2" becomes style "Heading" at level 2.
            level = +style.substr(style.length - 1);
            style = style.substr(0, style.length - 2);
        }
        if (lastStyle && style !== lastStyle) {
            writeBlockEnd();
        }
        switch (style) {
            case "Heading":
            case "Appendix":
                var section = p.range.listFormat.listString;
                write("####".substr(0, level) + " <a name=\"" + section + "\"/>" + section + " " + text + "\n\n");
                break;
            case "Normal":
                if (text.length) {
                    write(text + "\n\n");
                }
                break;
            case "List Paragraph":
                // Two spaces per nesting level so Markdown nests the item.
                write(indent(p.range.listFormat.listLevelNumber * 2 - 2) + "* " + text + "\n");
                break;
            case "Grammar":
                // A manual line break (\x0B) becomes a Markdown hard break,
                // which needs two trailing spaces before the newline.
                write("&emsp;&emsp;" + text.replace(/\s\s\s/g, "&emsp;").replace(/\x0B/g, "  \n&emsp;&emsp;&emsp;") + "\n\n");
                break;
            case "Code":
                if (lastStyle !== "Code") {
                    write("```TypeScript\n");
                }
                else {
                    write("\n");
                }
                write(text.replace(/\x0B/g, " \n") + "\n");
                break;
            case "Table":
                if (!lastInTable) {
                    // One extra slot per row for the paragraph that ends the row.
                    tableColumnCount = p.range.tables.item(1).columns.count + 1;
                    tableCellIndex = 0;
                }
                if (tableCellIndex < tableColumnCount) {
                    // Only the first row determines the column alignment.
                    columnAlignment[tableCellIndex] = p.alignment;
                }
                write("|" + text);
                tableCellIndex++;
                if (tableCellIndex % tableColumnCount === 0) {
                    write("\n");
                    if (tableCellIndex === tableColumnCount) {
                        writeTableHeader();
                    }
                }
                break;
            case "TOC Heading":
                write("## " + text + "\n\n");
                break;
            case "TOC":
                // Tab-separated entry: the first two fields are used as the
                // link target/number and the title.
                var strings = text.split("\t");
                write(indent(level * 2 - 2) + "* [" + strings[0] + " " + strings[1] + "](#" + strings[0] + ")\n");
                break;
        }
        if (sectionBreak) {
            write("<br/>\n\n");
        }
        lastStyle = style;
        lastInTable = inTable;
    }

    // Walks the paragraphs in order, then closes whatever block is still open.
    function writeDocument() {
        var p = doc.paragraphs.first;
        while (p) {
            writeParagraph(p);
            p = p.next();
        }
        writeBlockEnd();
    }

    reformatSubscripts();
    reformatCodeFragments();
    reformatProductions();
    reformatTerminals();
    reformatBoldItalic();
    reformatItalic();
    reformatReferences();
    writeDocument();
    return result;
}
/**
 * Entry point: converts the Word document named by the single command-line
 * argument and writes the resulting Markdown to standard output.
 *
 * When the argument count is wrong a usage line is written instead and Word is
 * never started. Once Word has been started, the document is closed without
 * saving and the application is quit even if opening or converting fails, so
 * that no Word instance is left running; the error still propagates.
 *
 * @param args The command-line arguments; exactly one file name is expected.
 */
function main(args: string[]) {
    if (args.length !== 1) {
        sys.write("Syntax: word2md <filename>\n");
        return;
    }
    var app: Word.Application = sys.createObject("Word.Application");
    try {
        var doc = app.documents.open(args[0]);
        try {
            sys.write(convertDocumentToMarkdown(doc));
        }
        finally {
            // The conversion edits the document in place; discard those edits.
            doc.close(false);
        }
    }
    finally {
        app.quit();
    }
}
// Script entry: run the tool with the command-line arguments collected in sys.args.
main(sys.args);