#!/usr/bin/env nodejsscript /* jshint esversion: 11,-W097, -W040, module: true, node: true, expr: true, undef: true *//* global echo, $, pipe, s, fetch, cyclicLoop */ const tmp= "tmp-"; //pdftk has issue reading from /tmp, so need to put it somewhere to the same folder :-( if(!s.which("pdftk")) $.error("pdftk not found"); $.api() .version("2025-08-05") .describe([ "A small wrapper around 'pdftk' to extract data from pdf into the JSON.", "Or to update the PDF with data from JSON.", "Bookmars are converted to JSON object with key=PageNumber, value= Title (no. of spaces= level-1).", ]) .option("--debug", "Debug mode") .command("extract [file_info]", "Extract data from PDF.") .action(function extractCMD(file_pdf, file_info, { debug }){ if(!s.test("-f", file_pdf)) $.error("PDF File not found"); if(!file_info) file_info= filename(file_pdf) + ".json"; const info= extract(file_pdf); s.echo(info).to(file_info); $.exit(0); }) .command("update [file_info]", "Update PDF with data from JSON.") .action(function update(file_pdf, file_info, { debug }){ if(!s.test("-f", file_pdf)) $.error("PDF File not found"); if(!file_info) file_info= filename(file_pdf) + ".json"; if(!s.test("-f", file_info)) $.error("Info File not found"); const infoIsHtml= file_info.endsWith(".html"); const info= infoIsHtml ? infoFromHTML(file_info, file_pdf, debug) : infoFromJSON(file_info); const temp= `${tmp}${tmpname(file_pdf)}.info`; s.echo(info).to(temp); const tmp_pdf= `${tmp}${tmpname(file_pdf)}.pdf`; s.cp(file_pdf, tmp_pdf); s.run`pdftk ${tmp_pdf} update_info_utf8 ${temp} output ${file_pdf}`; if(!debug){ s.rm(tmp_pdf); s.rm(temp); } $.exit(0); }) .command("convert ", "Converts between JSON and raw text.") .action(function convert(file_info){ if(!s.test("-f", file_info)) $.error("Info File not found"); const ext= file_info.slice(file_info.lastIndexOf(".")); const info= ext===".json" ?
/*
 * NOTE(review): this file appears to have lost its line breaks. As stored, the whole
 * first physical line starts with `#!`, so its text is swallowed by the hashbang line
 * (and, after `//pdftk ...`, by that line comment too). It has to be split again at
 * its statement boundaries before the script can run. It is left untouched here.
 *
 * What the text of that first line contains:
 * - `tmp`: the "tmp-" prefix used for every temporary file below. Per its comment, pdftk
 *   has trouble reading from /tmp, so temporary files are created next to the working files.
 * - A guard that calls `$.error("pdftk not found")` when `s.which("pdftk")` is falsy.
 * - The `$.api()` chain: version, description, a `--debug` option and three commands:
 *   - extract (`extractCMD`): requires the PDF to exist, defaults `file_info` to
 *     `filename(file_pdf) + ".json"`, and writes the result of `extract(file_pdf)` there.
 *     NOTE(review): `debug` is destructured but not forwarded to `extract`, so the
 *     temporary `.info` dump is always removed for this command - looks unintended.
 *   - update: requires both files to exist, builds the pdftk info text with `infoFromHTML`
 *     when `file_info` ends with ".html" and with `infoFromJSON` otherwise, copies the PDF
 *     to a temporary file and runs `pdftk ... update_info_utf8 ... output ${file_pdf}`,
 *     then removes both temporary files unless `debug` is set.
 *   - convert: echoes `infoFromJSON(file_info)` when the extension is ".json" and
 *     `infoToJSON(file_info)` otherwise.
 * - NOTE(review): the usage strings read "extract [file_info]", "update [file_info]" and
 *   "convert " although the actions take `file_pdf` / `file_info` as first positional.
 *   Presumably angle-bracket placeholders were stripped - confirm against the upstream file.
 */
// Tail of the `convert` action: the two branches of the conditional that begins at the end
// of the first physical line, followed by the end of the `$.api()` chain.
infoFromJSON(file_info) : infoToJSON(file_info);
		echo(info);
		$.exit(0);
	})
	.parse();

/**
 * Dumps the metadata of a PDF with `pdftk ... dump_data_utf8` into a temporary `.info`
 * file and returns that dump converted by `infoToJSON`.
 *
 * The temporary file name is built from the `tmp` prefix and `tmpname(file_pdf)`.
 *
 * @param {string} file_pdf Path of the PDF to read.
 * @param {boolean} [debug] When truthy, the temporary `.info` file is kept.
 * @returns {string} Whatever `infoToJSON` returns for the dump.
 */
function extract(file_pdf, debug){
	const temp= `${tmp}${tmpname(file_pdf)}.info` ;
	s.run`pdftk ${file_pdf} dump_data_utf8 output ${temp}`;
	const out= infoToJSON(temp);
	if(!debug) s.rm(temp);
	return out;
}

/**
 * Returns the part of `path` between its last "/" and its last ".".
 *
 * NOTE(review): when `path` contains no ".", `lastIndexOf` yields -1 and the slice
 * drops the last character instead of returning the whole name.
 *
 * @param {string} path
 * @returns {string}
 */
function filename(path){
	return path.slice(path.lastIndexOf("/")+1, path.lastIndexOf("."));
}

/**
 * Builds a temporary base name: `filename(path)`, a dash and the current `Date.now()` value.
 *
 * @param {string} path
 * @returns {string}
 */
function tmpname(path){
	return filename(path) + "-" + Date.now();
}

/**
 * Produces pdftk info text for `file_pdf` with `Info.Title` overridden from `file_info`.
 *
 * Steps: extract and parse the current info of the PDF, scan the trimmed lines of
 * `file_info` to set `Info.Title`, write the merged object to a temporary JSON file,
 * convert it with `infoFromJSON`, and remove the temporary file unless `debug` is truthy.
 *
 * @param {string} file_info Path of the file to take the title from.
 * @param {string} file_pdf Path of the PDF whose current info is the base.
 * @param {boolean} [debug] Forwarded to `extract`; when truthy the temporary JSON is kept.
 * @returns {string} The text returned by `infoFromJSON`.
 */
function infoFromHTML(file_info, file_pdf, debug){
	const info_orig= JSON.parse(extract(file_pdf, debug));
	const info= s.cat(file_info).trim();
	// NOTE(review): `isInside` is never read or reassigned.
	let isInside= false;
	for(const line_raw of info.split("\n")){
		const line= line_raw.trim();
		// NOTE(review): as written, `startsWith("")` is true for every line and
		// `replace("", "")` is a no-op, so each line overwrites the title with its text from
		// index 7 on and the second `if` is never reached. The offset 7 matches the length of
		// an opening title tag, so the literals were presumably stripped - confirm upstream.
		if(line.startsWith("")){
			const title= line.slice(7).replace("", "").trim();
			info_orig.Info.Title= title;
			continue;
		}
		if(line.startsWith("")){
			break;
		}
	}
	const tmp_json= `${tmp}${tmpname(file_pdf)}.json`;
	s.echo(JSON.stringify(info_orig, null, "\t")).to(tmp_json);
	const out= infoFromJSON(tmp_json);
	if(!debug) s.rm(tmp_json);
	return out;
}

/**
 * Converts the JSON representation stored in `file_info` into line-based info text.
 *
 * - `Bookmark` (object of key -> title) is first turned into an array of
 *   `{ PageNumber, Title, Level }` records: the key is parsed as an integer, the number
 *   of leading spaces in the title plus one becomes `Level`, and the title loses them.
 * - Array values emit, per element, `<key>Begin` and one `<key><subkey>: <value>` line
 *   per property.
 * - Other object values emit, per property, `<key>Begin`, `<key>Key: <subkey>` and
 *   `<key>Value: <value>`.
 * - Anything else emits `<key>: <value>`.
 *
 * Presumably `.xargs(JSON.parse)` hands the file content to `JSON.parse` and `pipe`
 * composes its arguments left to right - both come from the runtime, not from this file.
 *
 * NOTE(review): throws when the JSON has no `Bookmark` property (`Object.entries` of
 * undefined) and when any top-level value is `null` (it passes the `typeof` check).
 *
 * @param {string} file_info Path of the JSON file.
 * @returns {string} The generated lines joined with "\n".
 */
function infoFromJSON(file_info){
	const info= s.cat(file_info).xargs(JSON.parse);
	const output= [];
	info.Bookmark= Object.entries(info.Bookmark)
		.map(/** @param {[string, string]} _ */([PageNumber, Title])=> {
			// No radix is passed; parsing stops at the first non-digit, so keys of the
			// `<page>-<index>` form built in `infoToJSON` yield the page number.
			PageNumber= Number.parseInt(PageNumber);
			// Index of the first non-space character; -1 for an empty or all-space title,
			// which then gives Level 0 and a one-character slice.
			const level= Title.search(/[^ ]/);
			return {
				PageNumber,
				Title: Title.slice(level),
				Level: level+1,
			};
		});
	for(const [key, value] of Object.entries(info)){
		if(Array.isArray(value)){
			const records= value.flatMap(pipe(
				Object.entries,
				entries=> [key+"Begin"].concat(entries.map(([subkey, value])=> `${key}${subkey}: ${value}`)),
			));
			output.push(...records);
			continue;
		}
		if(typeof value==="object"){
			const records= Object.entries(value).flatMap(([subkey, value])=> [
				key+"Begin",
				`${key}Key: ${subkey}`,
				`${key}Value: ${value}`,
			]);
			output.push(...records);
			continue;
		}
		output.push(`${key}: ${value}`);
	}
	return output.join("\n");
}

/**
 * Reads the line-based info text in `file_info` and returns it as JSON text
 * (the collected entries passed through `Object.fromEntries` and `JSON.stringify`
 * with tab indentation).
 *
 * NOTE(review): the existing `@returns {Record}` tag did not match the string produced
 * by `JSON.stringify`; its type arguments were presumably stripped.
 *
 * @param {string} file_info Path of the info text file.
 * @returns {string} JSON text - assuming `pipe` returns the result of its last function.
 */
function infoToJSON(file_info){
	const output= new Map();
	const data= s.cat(file_info).split("\n");
	let line= 0;
	const isEnd= line=> line>=data.length;
	// NOTE(review): the statement below is damaged. The loop condition, the whole loop body
	// that fills `output`, and the start of the expression that consumes the bookmark list
	// are missing, so this does not parse. The surviving tail maps the "Bookmark" records
	// (or an empty list) to `<PageNumber>-<index>` keys with titles indented by `Level - 1`
	// spaces; the extra closing parenthesis suggests an enclosing call that is no longer
	// visible. Restore from the upstream file rather than guessing.
	for(; line items.map(({ PageNumber, Title, Level }, i)=>
			([PageNumber+"-"+i, " ".repeat(Number(Level)-1) + Title])),
		Object.fromEntries,
	)(output.get("Bookmark") || []));
	return pipe(
		Object.fromEntries,
		o=> JSON.stringify(o, null, "\t"),
	)(output.entries());
}