⚡ refact and moves ~/bin to ~/.local/bin
This commit is contained in:
182
.local/bin/§pdftk-data.mjs
Executable file
182
.local/bin/§pdftk-data.mjs
Executable file
@@ -0,0 +1,182 @@
|
||||
#!/usr/bin/env nodejsscript
|
||||
/* jshint esversion: 11,-W097, -W040, module: true, node: true, expr: true, undef: true *//* global echo, $, pipe, s, fetch, cyclicLoop */
|
||||
const tmp= "tmp-"; //pdftk has issue reading from /tmp, so need to put it somewhere to the same folder :-(
|
||||
if(!s.which("pdftk")) $.error("pdftk not found");
|
||||
|
||||
$.api()
|
||||
.version("2025-08-05")
|
||||
.describe([
|
||||
"A small wrapper around 'pdftk' to extract data from pdf into the JSON.",
|
||||
"Or to update the PDF with data from JSON.",
|
||||
"Bookmars are converted to JSON object with key=PageNumber, value= Title (no. of spaces= level-1).",
|
||||
])
|
||||
.option("--debug", "Debug mode")
|
||||
.command("extract <file_pdf> [file_info]", "Extract data from PDF.")
|
||||
.action(function extractCMD(file_pdf, file_info, { debug }){
|
||||
if(!s.test("-f", file_pdf)) $.error("PDF File not found");
|
||||
if(!file_info) file_info= filename(file_pdf) + ".json";
|
||||
|
||||
const info= extract(file_pdf);
|
||||
s.echo(info).to(file_info);
|
||||
$.exit(0);
|
||||
})
|
||||
.command("update <file_pdf> [file_info]", "Update PDF with data from JSON.")
|
||||
.action(function update(file_pdf, file_info, { debug }){
|
||||
if(!s.test("-f", file_pdf)) $.error("PDF File not found");
|
||||
if(!file_info) file_info= filename(file_pdf) + ".json";
|
||||
if(!s.test("-f", file_info)) $.error("Info File not found");
|
||||
|
||||
const infoIsHtml= file_info.endsWith(".html");
|
||||
|
||||
const info= infoIsHtml ? infoFromHTML(file_info, file_pdf, debug) : infoFromJSON(file_info);
|
||||
const temp= `${tmp}${tmpname(file_pdf)}.info`;
|
||||
s.echo(info).to(temp);
|
||||
const tmp_pdf= `${tmp}${tmpname(file_pdf)}.pdf`;
|
||||
s.cp(file_pdf, tmp_pdf);
|
||||
s.run`pdftk ${tmp_pdf} update_info_utf8 ${temp} output ${file_pdf}`;
|
||||
if(!debug){
|
||||
s.rm(tmp_pdf);
|
||||
s.rm(temp);
|
||||
}
|
||||
$.exit(0);
|
||||
})
|
||||
.command("convert <file_info>", "Converts between JSON and raw text.")
|
||||
.action(function convert(file_info){
|
||||
if(!s.test("-f", file_info)) $.error("Info File not found");
|
||||
const ext= file_info.slice(file_info.lastIndexOf("."));
|
||||
const info= ext===".json" ? infoFromJSON(file_info) : infoToJSON(file_info);
|
||||
echo(info);
|
||||
$.exit(0);
|
||||
})
|
||||
.parse();
|
||||
|
||||
function extract(file_pdf, debug){
|
||||
const temp= `${tmp}${tmpname(file_pdf)}.info` ;
|
||||
s.run`pdftk ${file_pdf} dump_data_utf8 output ${temp}`;
|
||||
const out= infoToJSON(temp);
|
||||
if(!debug) s.rm(temp);
|
||||
return out;
|
||||
}
|
||||
function filename(path){ return path.slice(path.lastIndexOf("/")+1, path.lastIndexOf(".")); }
|
||||
function tmpname(path){ return filename(path) + "-" + Date.now(); }
|
||||
function infoFromHTML(file_info, file_pdf, debug){
|
||||
const info_orig= JSON.parse(extract(file_pdf, debug));
|
||||
const info= s.cat(file_info).trim();
|
||||
let isInside= false;
|
||||
for(const line_raw of info.split("\n")){
|
||||
const line= line_raw.trim();
|
||||
if(line.startsWith("<head")){
|
||||
isInside= true;
|
||||
continue;
|
||||
}
|
||||
if(!line || !isInside) continue;
|
||||
if(line.startsWith("<title>")){
|
||||
const title= line.slice(7).replace("</title>", "").trim();
|
||||
info_orig.Info.Title= title;
|
||||
continue;
|
||||
}
|
||||
if(line.startsWith("<meta") && line.includes("name=")){
|
||||
const [,, key]= line.match(/name=("|')(.*?)(\1)/);
|
||||
const [,, value]= line.match(/content=("|')(.*?)(\1)/);
|
||||
info_orig.Info[key[0].toUpperCase()+key.slice(1)]= value;
|
||||
}
|
||||
if(line.startsWith("</head>")){
|
||||
break;
|
||||
}
|
||||
}
|
||||
const tmp_json= `${tmp}${tmpname(file_pdf)}.json`;
|
||||
s.echo(JSON.stringify(info_orig, null, "\t")).to(tmp_json);
|
||||
const out= infoFromJSON(tmp_json);
|
||||
if(!debug) s.rm(tmp_json);
|
||||
return out;
|
||||
}
|
||||
function infoFromJSON(file_info){
|
||||
const info= s.cat(file_info).xargs(JSON.parse);
|
||||
const output= [];
|
||||
info.Bookmark= Object.entries(info.Bookmark)
|
||||
.map(/** @param {[string, string]} _ */([PageNumber, Title])=> {
|
||||
PageNumber= Number.parseInt(PageNumber);
|
||||
const level= Title.search(/[^ ]/);
|
||||
return {
|
||||
PageNumber,
|
||||
Title: Title.slice(level),
|
||||
Level: level+1,
|
||||
};
|
||||
});
|
||||
for(const [key, value] of Object.entries(info)){
|
||||
if(Array.isArray(value)){
|
||||
const records= value.flatMap(pipe(
|
||||
Object.entries,
|
||||
entries=> [key+"Begin"].concat(entries.map(([subkey, value])=> `${key}${subkey}: ${value}`)),
|
||||
));
|
||||
output.push(...records);
|
||||
continue;
|
||||
}
|
||||
if(typeof value==="object"){
|
||||
const records= Object.entries(value).flatMap(([subkey, value])=> [
|
||||
key+"Begin",
|
||||
`${key}Key: ${subkey}`,
|
||||
`${key}Value: ${value}`,
|
||||
]);
|
||||
output.push(...records);
|
||||
continue;
|
||||
}
|
||||
output.push(`${key}: ${value}`);
|
||||
}
|
||||
return output.join("\n");
|
||||
}
|
||||
/** @returns {Record<string, unknown>} */
|
||||
function infoToJSON(file_info){
|
||||
const output= new Map();
|
||||
const data= s.cat(file_info).split("\n");
|
||||
let line= 0;
|
||||
const isEnd= line=> line>=data.length;
|
||||
for(; line<data.length; line++){
|
||||
const content= data[line].trim();
|
||||
if(!content) continue;
|
||||
|
||||
if("InfoBegin"===content){
|
||||
const info= output.has("Info") ? output.get("Info") : {};
|
||||
const curr= [];
|
||||
while(++line){
|
||||
if(isEnd(line)) break;
|
||||
const content= data[line].trim();
|
||||
if(!content){ line++; continue; }
|
||||
const [key, value]= content.split(": ");
|
||||
const index= ["InfoKey", "InfoValue"].indexOf(key);
|
||||
if(index===-1){ line--; break; }
|
||||
curr[index]= value;
|
||||
}
|
||||
const [key, value]= curr;
|
||||
info[key]= value;
|
||||
output.set("Info", info);
|
||||
continue;
|
||||
}
|
||||
if(content.endsWith("Begin")){
|
||||
const name= content.replace("Begin", "");
|
||||
const output_curr= output.has(name) ? output.get(name) : [];
|
||||
const curr= {};
|
||||
while(++line){
|
||||
if(isEnd(line)) break;
|
||||
const content= data[line].trim();
|
||||
if(!content){ line++; continue; }
|
||||
const [key, value]= content.split(": ");
|
||||
if(!key.startsWith(name) || key === (name+"Begin")){ line--; break; }
|
||||
curr[key.replace(name, "")]= value;
|
||||
}
|
||||
output_curr.push(curr);
|
||||
output.set(name, output_curr);
|
||||
continue;
|
||||
}
|
||||
const [key, value]= content.split(": ");
|
||||
output.set(key, value);
|
||||
}
|
||||
output.set("Bookmark", pipe(
|
||||
items=> items.map(({ PageNumber, Title, Level }, i)=> ([PageNumber+"-"+i, " ".repeat(Number(Level)-1) + Title])),
|
||||
Object.fromEntries,
|
||||
)(output.get("Bookmark") || []));
|
||||
return pipe(
|
||||
Object.fromEntries,
|
||||
o=> JSON.stringify(o, null, "\t"),
|
||||
)(output.entries());
|
||||
}
|
Reference in New Issue
Block a user