Files
dotfiles/.local/bin/§pdftk-data.mjs

183 lines
5.9 KiB
JavaScript
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env nodejsscript
/* jshint esversion: 11,-W097, -W040, module: true, node: true, expr: true, undef: true *//* global echo, $, pipe, s, fetch, cyclicLoop */
// pdftk has issues reading from /tmp, so temporary files are created next to
// the working files (current folder) and marked with this prefix instead.
const tmp = "tmp-";
if(!s.which("pdftk")) $.error("pdftk not found");

$.api()
	.version("2025-08-05")
	.describe([
		"A small wrapper around 'pdftk' to extract data from pdf into the JSON.",
		"Or to update the PDF with data from JSON.",
		"Bookmars are converted to JSON object with key=PageNumber, value= Title (no. of spaces= level-1).",
	])
	.option("--debug", "Debug mode")
	.command("extract <file_pdf> [file_info]", "Extract data from PDF.")
	// Dump the PDF metadata as JSON into `file_info` (default: "<pdf name>.json").
	.action(function extractCMD(file_pdf, file_info, { debug }){
		if(!s.test("-f", file_pdf)) $.error("PDF File not found");
		if(!file_info) file_info = filename(file_pdf) + ".json";
		// Forward --debug so the intermediate pdftk dump is kept for inspection.
		const info = extract(file_pdf, debug);
		s.echo(info).to(file_info);
		$.exit(0);
	})
	.command("update <file_pdf> [file_info]", "Update PDF with data from JSON.")
	// Rewrite the PDF metadata from a JSON (or, for "*.html", an HTML <head>) file.
	.action(function update(file_pdf, file_info, { debug }){
		if(!s.test("-f", file_pdf)) $.error("PDF File not found");
		if(!file_info) file_info = filename(file_pdf) + ".json";
		if(!s.test("-f", file_info)) $.error("Info File not found");
		const infoIsHtml = file_info.endsWith(".html");
		const info = infoIsHtml ? infoFromHTML(file_info, file_pdf, debug) : infoFromJSON(file_info);
		const temp = `${tmp}${tmpname(file_pdf)}.info`;
		s.echo(info).to(temp);
		// pdftk reads from a copy and writes its output over the original PDF path.
		const tmp_pdf = `${tmp}${tmpname(file_pdf)}.pdf`;
		s.cp(file_pdf, tmp_pdf);
		s.run`pdftk ${tmp_pdf} update_info_utf8 ${temp} output ${file_pdf}`;
		// In debug mode the temporary copy and info file are left behind.
		if(!debug){
			s.rm(tmp_pdf);
			s.rm(temp);
		}
		$.exit(0);
	})
	.command("convert <file_info>", "Converts between JSON and raw text.")
	// "*.json" input is converted to the raw text form, anything else to JSON.
	.action(function convert(file_info){
		if(!s.test("-f", file_info)) $.error("Info File not found");
		const ext = file_info.slice(file_info.lastIndexOf("."));
		const info = ext === ".json" ? infoFromJSON(file_info) : infoToJSON(file_info);
		echo(info);
		$.exit(0);
	})
	.parse();
/**
 * Dump the metadata of a PDF with `pdftk … dump_data_utf8` and return it
 * converted by `infoToJSON`.
 *
 * @param {string} file_pdf Path of the PDF to read.
 * @param {boolean} [debug] When truthy, the intermediate dump file is kept.
 * @returns {string} Whatever `infoToJSON` produces for the dump file.
 */
function extract(file_pdf, debug){
	const dumpPath = `${tmp}${tmpname(file_pdf)}.info`;
	s.run`pdftk ${file_pdf} dump_data_utf8 output ${dumpPath}`;
	const converted = infoToJSON(dumpPath);
	if(debug) return converted;
	// Not debugging: the dump file is no longer needed.
	s.rm(dumpPath);
	return converted;
}
/**
 * Base name of `path`: the directory part and the last extension are removed.
 *
 * The extension is looked up in the base name only, so a dot inside a folder
 * name is ignored, and a name without an extension (or a dotfile such as
 * ".bashrc") is returned unchanged instead of being truncated.
 *
 * @param {string} path File path using "/" as separator.
 * @returns {string} File name without directory and extension.
 */
function filename(path){
	const base = path.slice(path.lastIndexOf("/") + 1);
	const dot = base.lastIndexOf(".");
	// dot <= 0 means "no extension" (-1) or "leading dot only" (0).
	return dot > 0 ? base.slice(0, dot) : base;
}
/**
 * Build a name for a temporary file: the result of `filename(path)` followed
 * by "-" and the current `Date.now()` timestamp.
 *
 * @param {string} path Path the temporary name is derived from.
 * @returns {string} "<filename>-<timestamp>".
 */
function tmpname(path){
	const base = filename(path);
	return `${base}-${Date.now()}`;
}
/**
 * Build the pdftk info text for `file_pdf`, with its `Info` records overridden
 * by the `<title>` and `<meta name=… content=…>` tags found in the `<head>` of
 * an HTML file.
 *
 * The HTML is scanned line by line: only lines after one starting with
 * "<head" are considered and scanning stops at a line starting with "</head>".
 * Meta names are stored with their first letter upper-cased. The merged data is
 * written to a temporary JSON file and converted with `infoFromJSON`.
 *
 * @param {string} file_info Path of the HTML file.
 * @param {string} file_pdf Path of the PDF whose current data is the base.
 * @param {boolean} [debug] When truthy, temporary files are kept.
 * @returns {string} Result of `infoFromJSON` for the merged data.
 */
function infoFromHTML(file_info, file_pdf, debug){
	const info_orig = JSON.parse(extract(file_pdf, debug));
	// The extracted data has no `Info` entry when the PDF carries no Info records.
	if(!info_orig.Info || typeof info_orig.Info !== "object") info_orig.Info = {};

	let isInside = false;
	for(const line_raw of s.cat(file_info).trim().split("\n")){
		const line = line_raw.trim();
		if(line.startsWith("<head")){
			isInside = true;
			continue;
		}
		if(!line || !isInside) continue;
		if(line.startsWith("</head>")) break;
		if(line.startsWith("<title>")){
			info_orig.Info.Title = line.slice("<title>".length).replace("</title>", "").trim();
			continue;
		}
		if(line.startsWith("<meta") && line.includes("name=")){
			const key = line.match(/name=("|')(.*?)(\1)/)?.[2];
			const value = line.match(/content=("|')(.*?)(\1)/)?.[2];
			// Skip tags without a quoted, non-empty name or without quoted content
			// instead of crashing on a failed match.
			if(!key || value === undefined) continue;
			info_orig.Info[key[0].toUpperCase() + key.slice(1)] = value;
		}
	}

	// Round-trip through a file because `infoFromJSON` reads its input from disk.
	const tmp_json = `${tmp}${tmpname(file_pdf)}.json`;
	s.echo(JSON.stringify(info_orig, null, "\t")).to(tmp_json);
	const out = infoFromJSON(tmp_json);
	if(!debug) s.rm(tmp_json);
	return out;
}
/**
 * Convert a JSON info file into the line-based text form passed to
 * `pdftk … update_info_utf8`.
 *
 * - `Bookmark` is an object of `"<PageNumber>-<index>": "<indent><Title>"`;
 *   the number of leading spaces in the value is the bookmark level minus one.
 * - Array values become one `<Key>Begin` group per item, each item property
 *   written as `<Key><Property>: <value>`.
 * - Other object values become one `<Key>Begin` / `<Key>Key` / `<Key>Value`
 *   group per property.
 * - Everything else is written as `<Key>: <value>`.
 *
 * @param {string} file_info Path of the JSON file.
 * @returns {string} The records joined by "\n".
 */
function infoFromJSON(file_info){
	const info = s.cat(file_info).xargs(JSON.parse);
	// A hand-written file may not contain any bookmarks at all.
	info.Bookmark = Object.entries(info.Bookmark ?? {})
		.map(/** @param {[string, string]} _ */([pageKey, title])=> {
			const firstNonSpace = title.search(/[^ ]/);
			// -1: empty or all-space title, i.e. the whole text is indentation.
			const indent = firstNonSpace === -1 ? title.length : firstNonSpace;
			return {
				// parseInt stops at the "-<index>" suffix of the key.
				PageNumber: Number.parseInt(pageKey, 10),
				Title: title.slice(indent),
				Level: indent + 1,
			};
		});

	const output = [];
	for(const [key, value] of Object.entries(info)){
		// JSON `null` carries no data to write.
		if(value === null || value === undefined) continue;
		if(Array.isArray(value)){
			for(const record of value){
				output.push(`${key}Begin`);
				for(const [subkey, subvalue] of Object.entries(record))
					output.push(`${key}${subkey}: ${subvalue}`);
			}
			continue;
		}
		if(typeof value === "object"){
			for(const [subkey, subvalue] of Object.entries(value))
				output.push(`${key}Begin`, `${key}Key: ${subkey}`, `${key}Value: ${subvalue}`);
			continue;
		}
		output.push(`${key}: ${value}`);
	}
	return output.join("\n");
}
/**
 * Convert the line-based info text (as written by `pdftk … dump_data_utf8`)
 * into a tab-indented JSON string.
 *
 * - All `InfoBegin` groups (`InfoKey` / `InfoValue` pairs) are merged into one
 *   `Info` object.
 * - Every other `<Name>Begin` group becomes an object (record keys with the
 *   `<Name>` prefix removed) appended to the array stored under `<Name>`.
 * - Remaining `Key: value` lines are stored as plain entries.
 * - Finally `Bookmark` is turned into an object keyed by
 *   `"<PageNumber>-<index>"` whose values are the titles prefixed with
 *   `Level - 1` spaces.
 *
 * Blank lines are ignored and values are split off at the first ": " only, so
 * values containing ": " are kept whole.
 *
 * @param {string} file_info Path of the text file.
 * @returns {string} JSON text.
 */
function infoToJSON(file_info){
	const data = s.cat(file_info).split("\n");
	const output = new Map();

	/** Split a trimmed "Key: value" line into `[key, value]`. */
	const parseRecord = text => {
		const at = text.indexOf(": ");
		if(at !== -1) return [text.slice(0, at), text.slice(at + 2)];
		// "Key: " with an empty value lost its trailing space to trim().
		if(text.endsWith(":")) return [text.slice(0, -1), ""];
		return [text, undefined];
	};

	let line = 0;
	/**
	 * Consume the records following a `*Begin` line. Stops in front of the
	 * first record `accepts` rejects, leaving `line` on the last consumed line
	 * so the outer loop's increment lands on the rejected one.
	 */
	const readGroup = (accepts, store) => {
		while(++line < data.length){
			const content = data[line].trim();
			if(!content) continue;
			const [key, value] = parseRecord(content);
			if(!accepts(key)){ line--; break; }
			store(key, value);
		}
	};

	const infoFields = ["InfoKey", "InfoValue"];
	for(; line < data.length; line++){
		const content = data[line].trim();
		if(!content) continue;
		if(content === "InfoBegin"){
			const info = output.has("Info") ? output.get("Info") : {};
			const pair = [];
			readGroup(
				key => infoFields.includes(key),
				(key, value) => { pair[infoFields.indexOf(key)] = value; },
			);
			const [key, value] = pair;
			// A group without `InfoKey` has nothing to be stored under.
			if(key !== undefined) info[key] = value;
			output.set("Info", info);
			continue;
		}
		if(content.endsWith("Begin")){
			const name = content.slice(0, -"Begin".length);
			const records = output.has(name) ? output.get(name) : [];
			const curr = {};
			readGroup(
				// A new `<Name>Begin` starts the next group.
				key => key.startsWith(name) && key !== name + "Begin",
				(key, value) => { curr[key.slice(name.length)] = value; },
			);
			records.push(curr);
			output.set(name, records);
			continue;
		}
		const [key, value] = parseRecord(content);
		output.set(key, value);
	}

	// The index suffix keeps keys unique when several bookmarks share a page.
	const bookmarks = (output.get("Bookmark") || []).map(({ PageNumber, Title, Level }, i) => [
		PageNumber + "-" + i,
		" ".repeat(Math.max(0, Number(Level) - 1)) + (Title ?? ""),
	]);
	output.set("Bookmark", Object.fromEntries(bookmarks));
	return JSON.stringify(Object.fromEntries(output), null, "\t");
}