first commit

This commit is contained in:
afoucaultc 2026-06-05 13:11:08 +02:00
commit 205faf4224
5471 changed files with 973850 additions and 0 deletions

View file

@ -0,0 +1,659 @@
#!/usr/bin/env python
# coding=utf-8
#
# Copyright (c) 2023 David Burghoff <burghoff@utexas.edu>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# Debug switches. Each flag is assigned twice and the second assignment wins,
# so both are currently off (presumably the two lines are swapped by hand to
# turn a flag on).
# for debugging the parser
DEBUG_PARSER = True
DEBUG_PARSER = False
# for checking why elements aren't merging
DEBUG_MERGE = True
DEBUG_MERGE = False
# number of spaces beyond which text will be merged/split
NUM_SPACES = 1.0
# x tolerance (number of spaces) used by External_Merges; kept large since
# there are kerning inaccuracies (as big as -0.56 in Whitney)
XTOLEXT = 0.6
# y tolerance (fraction of cap height) used by External_Merges, should be
# pretty small
YTOLEXT = 0.1
# left tolerance for manual kerning removal, used to be huge but is now tighter
# since differential kerning was made default for PDF
XTOLMKN = 1.5
# right tolerance for manual kerning removal, should be fairly open-minded
XTOLMKP = (
    0.99
)
# y tolerance for manual kerning removal (scaled by the chunk's w.mch in
# Remove_Manual_Kerning)
YTOLMK = .01
# tolerance for manual kerning splitting, should be fairly tight
XTOLSPLIT = 0.5
# size ratio below which a chunk counts as smaller than its neighbour;
# ensuring sub/superscripts are smaller helps reduce false merges
SUBSUPER_THR = 0.99
# superscripts must be at least 1/3 of the way above the baseline to merge
# (1/3 below cap for sub)
SUBSUPER_YTHR = 1 / 3
import inkex
import inkex.text.parser as tp
import os, sys, re
sys.path.append(
os.path.dirname(os.path.realpath(sys.argv[0]))
) # make sure my directory is on the path
import dhelpers as dh
def remove_kerning(
    els,
    removemanual,
    mergesupersub,
    splitdistant,
    mergenearby,
    justification=None,
    debugparser=False,
):
    """Run the kerning-removal pipeline on the text elements found in ``els``.

    Args:
        els: elements to process; only ``inkex.TextElement`` (and, for the
            character table / debug highlights, ``inkex.FlowRoot``) are used.
        removemanual: run ``Remove_Manual_Kerning``.
        mergesupersub: allow sub/superscript merges in ``External_Merges``.
        splitdistant: run the three splitting passes.
        mergenearby: allow same-baseline merges in ``External_Merges``.
        justification: passed to ``Change_Justification``; ``None`` leaves
            the alignment alone.
        debugparser: like the module-level ``DEBUG_PARSER`` flag, only draw
            per-character highlights instead of processing.

    Returns:
        ``dh.unique(els + tels)``, i.e. the input elements plus any text
        elements created along the way.
    """
    tels = [el for el in els if isinstance(el, (inkex.TextElement, inkex.FlowRoot))]
    if tels:
        tels[0].croot.make_char_table(tels)

    if DEBUG_PARSER or debugparser:
        # Debug mode: highlight the parsed characters and stop there
        for el in tels:
            el.parsed_text.make_highlights("char")
        return dh.unique(els + tels)

    # Do merges first (deciding based on original position); from here on
    # only TextElements are processed
    tels = [el for el in els if isinstance(el, inkex.TextElement)]
    ptl = tp.ParsedTextList(tels)
    ptl.precalcs()
    ptl.make_next_chain()
    if removemanual:
        for pt in ptl:
            pt.differential_to_absolute_kerning()
            pt.make_next_chain()
        tels = Remove_Manual_Kerning(tels, mergesupersub)
    if mergenearby or mergesupersub:
        tels = External_Merges(tels, mergenearby, mergesupersub)

    # Then do splits (deciding based on current position, not original
    # position, since merges intentionally change position)
    if splitdistant:
        for split_pass in (Split_Distant_Chunks, Split_Distant_Intrachunk, Split_Lines):
            tels = split_pass(tels)

    # Final tweaks
    tels = Change_Justification(tels, justification)
    tels, removedspc = Remove_Trailing_Leading_Spaces(tels)
    if removemanual or mergenearby or mergesupersub or removedspc:
        tels = Fix_Merge_Positions(tels)
    tels = Final_Cleanup(Make_All_Editable(tels))
    return dh.unique(els + tels)
def Final_Cleanup(els):
    """Call ``delete_empty()`` on every element's parsed text; return ``els``."""
    for ptxt in (el.parsed_text for el in els):
        ptxt.delete_empty()
    return els
def Fix_Merge_Positions(els):
    """Call ``fix_merged_position()`` on every chunk of every line; return ``els``."""
    # Lazy generator: lines/chunks are looked up in the same order, and at the
    # same moments, as three nested for-loops would
    chunks = (chk for el in els for ln in el.parsed_text.lns for chk in ln.chks)
    for chk in chunks:
        chk.fix_merged_position()
    return els
def Remove_Trailing_Leading_Spaces(els):
    """Delete the spaces at the end and at the start of every line.

    Text flagged ``ismlinkscape`` or ``isflow`` (Inkscape-generated text) is
    skipped.

    Returns:
        ``(els, removed)`` where ``removed`` is True if any character was
        deleted.
    """
    removed = False
    for el in els:
        ptxt = el.parsed_text
        if ptxt.ismlinkscape or ptxt.isflow:
            continue  # skip Inkscape-generated text
        for line in ptxt.lns:
            # Trailing spaces: delete from the last character backwards
            text = line.txt()
            n_trailing = len(text) - len(text.rstrip(" "))
            last = len(text) - 1
            for idx in range(last, last - n_trailing, -1):
                line.chrs[idx].delc()
                removed = True

            # Leading spaces: re-read the text, then keep deleting the first
            # character
            text = line.txt()
            n_leading = len(text) - len(text.lstrip(" "))
            for _ in range(n_leading):
                line.chrs[0].delc()
                removed = True
    return els, removed
def Make_All_Editable(els):
    """Call ``make_editable()`` on every element's parsed text; return ``els``."""
    for ptxt in (el.parsed_text for el in els):
        ptxt.make_editable()
    return els
def Change_Justification(els, justification):
    """Re-align every line to ``justification`` and record it in the style.

    Args:
        els: text elements (each must expose ``parsed_text``).
        justification: ``"start"``, ``"middle"`` or ``"end"``; ``None`` is a
            no-op.

    Returns:
        ``els``. Text flagged ``ismlinkscape`` or ``isflow`` is left alone.
    """
    if justification is None:
        return els

    # text-anchor value -> matching text-align value
    anchor_to_align = {"start": "start", "middle": "center", "end": "end"}
    for ptxt in [el.parsed_text for el in els]:
        if ptxt.ismlinkscape or ptxt.isflow:
            continue  # skip Inkscape-generated text
        for line in ptxt.lns:
            line.change_alignment(justification)
        # cstyle's __setitem__ is called with two key/value pairs at once
        ptxt.textel.cstyle.__setitem__(
            "text-anchor", justification, "text-align", anchor_to_align[justification]
        )
    return els
# Split different lines
def Split_Lines(els, ignoreinkscape=True):
    """Split every multi-line text so that each line becomes its own element.

    Args:
        els: text elements; new elements are appended to this list.
        ignoreinkscape: when True, text flagged ``ismlinkscape`` is skipped.

    Returns:
        ``els``, extended with the elements split off. Flowed text
        (``isflow``) is never split.
    """
    # Snapshot the parsed texts up front: els grows while we iterate
    for ptxt in [el.parsed_text for el in els]:
        if ptxt.lns is None or len(ptxt.lns) <= 1:
            continue
        if (ptxt.ismlinkscape and ignoreinkscape) or ptxt.isflow:
            continue
        # Peel lines off from the last one down to the second; the first
        # line stays in the original element
        for idx in range(len(ptxt.lns) - 1, 0, -1):
            els.append(ptxt.split_off_characters(ptxt.lns[idx].chrs))
    return els
# Generate splitting of distantly-kerned text (between chunks of a line)
def Split_Distant_Chunks(els):
    """Split a line wherever two neighbouring chunks are too far apart.

    Chunks of each line are sorted by ``x``. A split is placed before a chunk
    whose bottom-left x lies more than one space (``NUM_SPACES``, minus the
    spaces already present at the join) plus ``XTOLSPLIT`` spaces to the
    right of the previous chunk's bottom-right x. Each line gets ``splits``
    (the split indices) and ``sws`` (the sorted chunks) set as attributes.

    Returns:
        ``els``, extended with the elements returned by ``split_off_chunks``.
    """
    for ptxt in [el.parsed_text for el in els]:
        if ptxt.lns is None:
            continue
        for line_idx in range(len(ptxt.lns) - 1, -1, -1):
            line = ptxt.lns[line_idx]
            # chunks sorted in ascending x (stable: ties keep line order)
            ordered = sorted(line.chks, key=lambda chk: chk.x)

            split_points = []
            for pos in range(1, len(line.chks)):
                left = ordered[pos - 1]
                right = ordered[pos]
                trl_spcs, ldg_spcs = trailing_leading(left.txt, right.txt)
                gap = left.spw * (NUM_SPACES - trl_spcs - ldg_spcs)
                xtol = XTOLSPLIT * left.spw
                _, br1, _, bl2 = left.get_ut_pts(right, current_pts=True)
                if bl2[0] > br1[0] + gap + xtol:
                    split_points.append(pos)
            line.splits = split_points
            line.sws = ordered

            # Each group runs from one split point to the next (the last one
            # to the end of the line); peel groups off right-to-left
            stops = split_points[1:] + [len(line.chks)]
            for start, stop in reversed(list(zip(split_points, stops))):
                els.append(ptxt.split_off_chunks(ordered[start:stop]))
    return els
# Generate splitting of distantly-kerned text (inside a single chunk)
def Split_Distant_Intrachunk(els):
    """Split a chunk wherever two of its characters are too far apart.

    Characters of each chunk are sorted by the x of their first ``pts_ut``
    point. A split is placed before a character when

    * its left edge lies more than ``NUM_SPACES + XTOLSPLIT`` space widths to
      the right of the previous non-blank character's right edge, or
    * it is a number separator (space, hyphen-minus or Unicode minus) sitting
      between two numeric runs in the same element, since such text may be
      a row of axis ticks.

    Text flagged ``ismlinkscape`` or ``isflow`` is skipped.

    Returns:
        ``els``, extended with the elements returned by
        ``split_off_characters``.
    """
    blanks = (" ", "\u00a0")
    # Chars that may separate numbers. The last entry is the Unicode minus
    # sign (U+2212), written as an escape so it cannot be lost in transit.
    numbersplits = [" ", "-", "\u2212"]
    number_sep = re.compile("|".join(re.escape(s) for s in numbersplits))

    for ptxt in [el.parsed_text for el in els]:
        if ptxt.lns is None or ptxt.ismlinkscape or ptxt.isflow:
            continue
        for line in ptxt.lns:
            for w in line.chks:
                if len(w.chrs) == 0:
                    continue
                chrs = sorted(w.chrs, key=lambda ch: ch.pts_ut[0][0])
                dx = w.spw * (NUM_SPACES)
                xtol = XTOLSPLIT * w.spw

                # Last non-blank character seen so far; gaps are measured
                # from it so that runs of spaces do not hide a large gap
                lastnspc = chrs[0] if chrs[0].c not in blanks else None
                splitiis = []
                prevsplit = 0
                for ii in range(1, len(chrs)):
                    c2 = chrs[ii]
                    if lastnspc is not None:
                        c = lastnspc
                        bl2 = c2.pts_ut[0]
                        br1 = c.pts_ut[3]

                        # If this character is splitting two numbers,
                        # should always split in case they are ticks
                        numbersplit = False
                        if c2.c in numbersplits and isnumeric(w.txt[prevsplit:ii]):
                            splrest = [
                                v for v in number_sep.split(w.txt[ii:]) if v != ""
                            ]
                            numbersplit = (
                                len(splrest) > 0
                                and isnumeric(splrest[0])
                                and c.loc.elem == c2.loc.elem
                            )

                        if bl2[0] > br1[0] + dx + xtol or numbersplit:
                            splitiis.append(ii)
                            prevsplit = ii
                    if c2.c not in blanks:
                        lastnspc = c2

                # Each group runs from one split index to the next (the last
                # one to the end of the chunk); peel groups off right-to-left
                stops = splitiis[1:] + [len(chrs)]
                for sstart, sstop in reversed(list(zip(splitiis, stops))):
                    group = chrs[sstart:sstop]
                    # keep the characters in the chunk's own order
                    split_chrs = [ch for ch in w.chrs if ch in group]
                    els.append(ptxt.split_off_characters(split_chrs))
    return els
def _manual_kerning_candidates(w, chks):
    """Return the merge candidates for chunk ``w`` and its ``nextw``.

    The result is ``[]`` or a single ``[w2, "same", br1, bl2]`` entry, in the
    format ``Perform_Merges`` reads from ``w.mw``.
    """
    w2 = w.nextw
    if w2 is None or w2 not in chks or twospaces(w.txt, w2.txt):
        return []

    trl_spcs, ldg_spcs = trailing_leading(w.txt, w2.txt)
    dx = w.spw * (NUM_SPACES - trl_spcs - ldg_spcs)
    xtoln = XTOLMKN * w.spw
    xtolp = XTOLMKP * w.spw
    ytol = YTOLMK * w.mch
    try:
        _, br1, _, bl2 = w.get_ut_pts(w2)
    except ZeroDivisionError:
        return []

    if isnumeric(w.txt) and isnumeric(w2.txt, True):
        # two numbers: do not allow any extra space between them
        dx = w.spw * 0

    previoussp = w.txt == " " and w.prevw is not None
    validmerge = br1[0] - xtoln <= bl2[0] <= br1[0] + dx + xtolp
    validmerge = validmerge and br1[1] - ytol <= bl2[1] <= br1[1] + ytol
    if previoussp and not validmerge:
        # reconsider in case previous space was weirdly-kerned: measure from
        # the chunk before the space and allow one more space of room
        _, br1p, _, bl2p = w.prevw.get_ut_pts(w2)
        dx = w.spw * (NUM_SPACES - trl_spcs - ldg_spcs + 1)
        validmerge = br1p[0] - xtoln <= bl2p[0] <= br1p[0] + dx + xtolp

    if validmerge:
        return [[w2, "same", br1, bl2]]
    return []


def Remove_Manual_Kerning(els, mergesupersub):
    """Merge chunks that were only separated by manual kerning.

    Every chunk is compared with its successor (``nextw``); when the
    successor starts where the chunk ends (within the ``XTOLMKN`` /
    ``XTOLMKP`` / ``YTOLMK`` tolerances) the pair is queued as a ``"same"``
    merge and ``Perform_Merges(..., mk=True)`` executes the queue. Afterwards
    every line that still has several chunks is split so that each chunk
    lives in its own text element.

    Args:
        els: text elements; new elements are appended to this list.
        mergesupersub: unused here, kept for interface compatibility.

    Returns:
        ``els``, extended with the elements split off.
    """
    # Generate list of merges
    ptxts = [el.parsed_text for el in els]
    chks = []
    for ptxt in ptxts:
        if ptxt.lns is not None:
            chks += [w for line in ptxt.lns for w in line.chks]

    for w in chks:
        w.mw = _manual_kerning_candidates(w, chks)

    Perform_Merges(chks, mk=True)

    # Following manual kerning removal, lines with multiple chunks
    # need to be split out into new text els
    for ptxt in ptxts:
        if ptxt.lns is None:
            # same guard as when collecting chunks above; without it a text
            # with no parsed lines raises TypeError here
            continue
        for line in ptxt.lns:
            while len(line.chks) > 1:
                newtxt = ptxt.split_off_chunks([line.chks[-1]])
                els.append(newtxt)
                # NOTE(review): the original read newtxt.parsed_text here (into
                # an otherwise unused list); the read is kept in case the
                # attribute access has side effects - confirm and drop if not.
                _ = newtxt.parsed_text
    return els
import numpy as np
def External_Merges(els, mergenearby, mergesupersub):
    """Find chunks that should be joined to a neighbouring chunk and merge them.

    Steps, as implemented below:

    1. Collect every chunk of every parsed line.
    2. Give each chunk a bounding box built from its characters'
       ``parsed_pts_t`` points, and an enlarged box ``bb_big`` grown by
       ``spw * scf * (NUM_SPACES + XTOLEXT)`` on every side.
    3. Keep the ordered pairs (w, w2), w != w2, whose angles agree to within
       0.001 and for which w's enlarged box intersects w2's plain box.
    4. Classify each pair and queue it in ``w.mw`` as
       ``[w2, mtype, br1, bl2]``; ``mtype`` is one of "same", "super", "sub",
       "subreturn", "superreturn", "superorsubreturn", "suborsuperreturn".
    5. Hand all chunks to ``Perform_Merges``.

    Args:
        els: text elements (each must expose ``parsed_text``).
        mergenearby: allow "same" merges (same baseline, same font size).
        mergesupersub: allow the sub/superscript merge types.

    Returns:
        ``els`` (the same list object).

    NOTE(review): the nesting of the if/elif/else ladders in this function
    was reconstructed from a copy that had lost its indentation - confirm
    against the upstream file.
    """
    # Generate list of merges
    chks = []
    for ptxt in [el.parsed_text for el in els]:
        if ptxt.lns is not None:
            chks += [w for line in ptxt.lns for w in line.chks]
    # Plain bounding box of each chunk, from all points of all its characters
    # (min()/max() raise ValueError for a chunk without character points)
    pbbs = [None]*len(chks)
    for ii, w in enumerate(chks):
        cx = [v[0] for c in w.chrs for v in c.parsed_pts_t]
        cy = [v[1] for c in w.chrs for v in c.parsed_pts_t]
        pbbs[ii] = tp.bbox([min(cx),min(cy),max(cx)-min(cx),max(cy)-min(cy)]);
    for ii, w in enumerate(chks):
        dx = (
            w.spw * w.scf * (NUM_SPACES + XTOLEXT)
        )  # a big bounding box that includes the extra space
        w.bb_big = tp.bbox(
            [
                pbbs[ii].x1 - dx,
                pbbs[ii].y1 - dx,
                pbbs[ii].w + 2 * dx,
                pbbs[ii].h + 2 * dx,
            ]
        )
        w.mw = []  # merge candidates for this chunk, filled in below
    # Vectorized angle / bbox calculations
    # angles is a 1xN row; subtracting its transpose gives all pairwise
    # differences as an NxN matrix
    angles = np.array([[w.angle for w in chks]])
    sameangle = abs(angles - angles.T) < 0.001
    bb1s = [w.bb_big for w in chks]
    bb2s = pbbs
    # presumably an NxN boolean matrix (entry [i, j]: bb1s[i] intersects
    # bb2s[j]) - TODO confirm against dh.bb_intersects
    intersects = dh.bb_intersects(bb1s, bb2s)
    potentials = np.logical_and(sameangle, intersects)
    potentials = np.logical_and(
        potentials, np.identity(len(chks)) == 0
    )  # off-diagonal only
    goodl = np.argwhere(potentials)  # one row [i, j] per candidate pair
    for ii in range(goodl.shape[0]):
        w = chks[goodl[ii, 0]]
        w2 = chks[goodl[ii, 1]]
        trl_spcs, ldg_spcs = trailing_leading(w.txt, w2.txt)
        # room for one space, minus the spaces already present at the join
        dx = w.spw * (NUM_SPACES - trl_spcs - ldg_spcs)
        xtol = XTOLEXT * w.spw
        ytol = YTOLEXT * w.mch
        # calculate 2's coords in 1's system
        tr1, br1, tl2, bl2 = w.get_ut_pts(w2)
        # w2 must start (bl2) where w ends (br1), give or take the tolerance
        xpenmatch = br1[0] - xtol <= bl2[0] <= br1[0] + dx + xtol
        neitherempty = len(wstrip(w.txt)) > 0 and len(wstrip(w2.txt)) > 0
        if xpenmatch and neitherempty and not twospaces(w.txt, w2.txt):
            weight_match = w.chrs[-1].tsty['font-weight'] == w2.chrs[0].tsty['font-weight']
            # Don't sub/super merge when differences in font-weight
            # Helps prevent accidental merges of subfigure label to tick
            letterinpar = bool(re.fullmatch(r"^\([a-zA-Z]\)$", w.txt))
            # Don't sub/super merge when is letter enclosed in parentheses
            # Helps prevent accidental merges of subfigure label to tick
            mtype = None
            if (
                abs(bl2[1] - br1[1]) < ytol
                and abs(w.tfs - w2.tfs) < 0.001
                and mergenearby
            ):
                # same baseline and same font size
                if isnumeric(w.line.txt()) and isnumeric(w2.line.txt(), True):
                    numsp = (bl2[0] - br1[0]) / (w.spw)
                    if abs(numsp) < 0.25:
                        # only merge numbers if very close (could be x ticks)
                        mtype = "same"
                else:
                    mtype = "same"
            elif (
                br1[1] + ytol >= bl2[1] >= tr1[1] - ytol and mergesupersub and weight_match and not letterinpar
            ):  # above baseline
                # w2's baseline is at least SUBSUPER_YTHR of the way from w's
                # baseline (br1) towards its top (tr1); y decreases upwards
                aboveline = (
                    br1[1] * (1 - SUBSUPER_YTHR) + tr1[1] * SUBSUPER_YTHR + ytol
                    >= bl2[1]
                )
                if w2.tfs < w.tfs * SUBSUPER_THR:  # new smaller, expect super
                    if aboveline:
                        mtype = "super"
                elif w.tfs < w2.tfs * SUBSUPER_THR:  # old smaller, expect return
                    mtype = "subreturn"
                elif SUBSUPER_THR == 1:
                    # only reachable when SUBSUPER_THR is set to exactly 1
                    if aboveline:
                        if len(w2.line.txt()) > 2:  # long text, probably not super
                            mtype = "subreturn"
                        else:
                            mtype = "superorsubreturn"
                            # could be either, decide later
                    else:
                        mtype = "subreturn"
            elif br1[1] + ytol >= tl2[1] >= tr1[1] - ytol and mergesupersub and weight_match and not letterinpar:
                # w2's top lies between w's baseline and top: below-baseline case
                belowline = (
                    tl2[1]
                    >= br1[1] * SUBSUPER_YTHR + tr1[1] * (1 - SUBSUPER_YTHR) - ytol
                )
                if w2.tfs < w.tfs * SUBSUPER_THR:  # new smaller, expect sub
                    if belowline:
                        mtype = "sub"
                elif w.tfs < w2.tfs * SUBSUPER_THR:  # old smaller, expect superreturn
                    mtype = "superreturn"
                elif SUBSUPER_THR == 1:
                    # only reachable when SUBSUPER_THR is set to exactly 1
                    if belowline:
                        if len(w2.line.txt()) > 2:  # long text, probably not sub
                            mtype = "superreturn"
                        else:
                            mtype = "suborsuperreturn"
                            # could be either, decide later
                    else:
                        mtype = "superreturn"
            if mtype is not None:
                w.mw.append([w2, mtype, br1, bl2])
                # dh.debug(w.txt+' to '+w2.txt+' as '+mtype)
        if DEBUG_MERGE:
            # Report why this pair was or was not queued
            dh.idebug('\nMerging "' + w.txt + '" and "' + w2.txt + '"')
            if not (xpenmatch):
                dh.idebug("Aborted, x pen too far: " + str([br1[0], bl2[0], dx]))
            elif not (neitherempty):
                dh.idebug("Aborted, one empty")
            else:
                if mtype is None:
                    if not (abs(bl2[1] - br1[1]) < ytol):
                        dh.idebug("Aborted, y pen too far: " + str([bl2[1], br1[1]]))
                    elif not (abs(w.tfs - w2.tfs) < 0.001):
                        dh.idebug(
                            "Aborted, fonts too different: " + str([w.tfs, w2.tfs])
                        )
                    elif not (
                        not (isnumeric(w.line.txt())) or not (isnumeric(w2.line.txt()))
                    ):
                        dh.idebug("Aborted, both numbers")
                else:
                    dh.idebug("Merged as " + mtype)
    Perform_Merges(chks)
    return els
def Perform_Merges(chks, mk=False):
    """Execute the merges queued in each chunk's ``mw`` list.

    Every chunk in ``chks`` must already carry ``mw``, a list of
    ``[w2, mtype, br1, bl2]`` candidates. The function works in four passes:

    1. Per chunk, keep only the candidate whose start x (``bl2[0]``) is
       closest to the chunk's end x (``br1[0]``).
    2. Link the single merges into chains (a -> b -> c becomes a: [b, c]) and
       flag every chunk that is merged into another one.
    3. Walk each chain's merge types through a normal/super/sub state
       machine; a chain with an impossible transition is dropped.
    4. For each chain head, append the chained chunks with ``append_chk`` and
       combine the clip paths of the text elements involved.

    Args:
        chks: chunks to process.
        mk: True when called for manual-kerning removal; then no space is
            inserted before a chunk with ``prevsametspan`` set if the merged
            text already contains a space.

    NOTE(review): the nesting in this function was reconstructed from a copy
    that had lost its indentation - confirm against the upstream file.
    """
    for w in chks:
        mw = w.mw
        minx = float("inf")
        for ii in range(len(mw)):
            w2 = mw[ii][0]
            mtype = mw[ii][1]
            br1 = mw[ii][2]
            bl2 = mw[ii][3]
            if abs(bl2[0] - br1[0]) < minx:
                minx = abs(bl2[0] - br1[0])
                # starting pen best matches the stop of the previous one
                mi = ii
        w.merges = []
        w.mergetypes = []
        w.merged = False
        if len(mw) > 0:
            # NOTE(review): mi is only assigned when a candidate beats minx;
            # if no comparison succeeds (e.g. NaN distances) it is stale from
            # an earlier chunk or undefined - confirm this cannot happen.
            w2 = mw[mi][0]
            mtype = mw[mi][1]
            br1 = mw[mi][2]
            bl2 = mw[mi][3]
            w.merges = [w2]
            w.mergetypes = [mtype]
    # Generate chains of merges
    for w in chks:
        # if w.txt=='T':
        if not (w.merged) and len(w.merges) > 0:
            w.merges[-1].merged = True
            nextmerge = w.merges[-1].merges
            nextmerget = w.merges[-1].mergetypes
            # Follow the chain until a chunk with no further merge is reached
            while len(nextmerge) > 0:
                w.merges += nextmerge
                w.mergetypes += nextmerget
                w.merges[-1].merged = True
                nextmerge = w.merges[-1].merges
                nextmerget = w.merges[-1].mergetypes
    # Create a merge plan
    for w in chks:
        if len(w.merges) > 0:
            # wtypes[0] is the head's own type; wtypes[k + 1] is the type
            # assigned to merges[k]
            ctype = "normal"
            w.wtypes = [ctype]
            bail = False
            for mt in w.mergetypes:
                if ctype == "normal":
                    if mt == "same":
                        pass
                    elif mt == "sub":
                        ctype = "sub"
                    elif mt == "super":
                        ctype = "super"
                    elif mt == "suborsuperreturn":
                        ctype = "sub"
                    elif mt == "superorsubreturn":
                        ctype = "super"
                    elif all(
                        [t == "normal" for t in w.wtypes]
                    ):  # maybe started on sub/super
                        # NOTE(review): this branch and the else below do
                        # the same thing
                        bail = True
                    else:
                        bail = True
                elif ctype == "super":
                    if mt == "same":
                        pass
                    elif mt == "superreturn":
                        ctype = "normal"
                    elif mt == "suborsuperreturn":
                        ctype = "normal"
                    else:
                        bail = True
                elif ctype == "sub":
                    if mt == "same":
                        pass
                    elif mt == "subreturn":
                        ctype = "normal"
                    elif mt == "superorsubreturn":
                        ctype = "normal"
                    else:
                        bail = True
                w.wtypes.append(ctype)
            if bail == True:
                # impossible sequence of types: drop the whole chain
                w.wtypes = []
                w.merges = []
    # Pre-merge position calculation
    # Execute the merge plan
    for w in chks:
        if len(w.merges) > 0 and not (w.merged):
            maxii = len(w.merges)
            alltxt = "".join([w.txt] + [w2.txt for w2 in w.merges])
            hasspaces = " " in alltxt
            mels = []  # text elements of the chunks merged into w
            for ii in range(maxii):
                maxspaces = None  # None: no limit is passed to append_chk
                if mk and hasspaces and w.merges[ii].prevsametspan:
                    maxspaces = 0
                if (
                    w.txt is not None and len(w.txt) > 0 and w.txt[-1] == " "
                ) or w.wtypes[ii + 1] in [
                    "super",
                    "sub",
                ]:  # no extra spaces for sub/supers or if there's already one
                    maxspaces = 0
                mels.append(w.merges[ii].line.ptxt.textel)
                w.append_chk(w.merges[ii], w.wtypes[ii + 1], maxspaces)
            # Union clips if necessary
            mels = dh.unique([w.line.ptxt.textel] + mels)
            if len(mels) > 1:
                clips = [el.get_link("clip-path") for el in mels]
                if any([c is None for c in clips]):
                    # at least one element is unclipped, so the merged text
                    # gets no clip at all
                    w.line.ptxt.textel.set("clip-path", None)
                else:
                    # Duplicate main clip
                    dc = clips[0].duplicate()
                    wt = mels[0].ccomposed_transform
                    for ii in range(1, len(mels)):
                        # Duplicate merged clip, group contents, move to main dupe
                        dc2 = clips[ii].duplicate()
                        ng = dh.group(list(dc2))
                        dc.append(ng)
                        # presumably -wt is the inverse transform, re-expressing
                        # the clip in mels[0]'s coordinates - TODO confirm
                        ng.ctransform = (-wt) @ mels[ii].ccomposed_transform
                        dc2.delete()
                    mels[0].set("clip-path", dc.get_id(2))
# Characters that may appear in a number. "\u2212" is the Unicode minus sign,
# written as an escape so it cannot be lost when the file is copied around.
ncs = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "E", "-", "\u2212", ","]


# Check if text represents a number
def isnumeric(s, countminus=False):
    """Return True if ``s`` represents a number.

    Surrounding whitespace is ignored, the Unicode minus sign (U+2212) is
    treated like ``-`` and commas are removed before the text is handed to
    ``float()``.

    Args:
        s: text to test.
        countminus: when True, a lone minus sign also counts as a number.

    Returns:
        bool
    """
    # strip whitespaces, replace minus signs with -, remove commas
    s = s.strip().replace("\u2212", "-").replace(",", "")
    if countminus and s == "-":  # count a minus sign as a number
        return True
    try:
        float(s)
    except ValueError:
        return False
    return True
# Strip whitespaces
def wstrip(txt):
    """Return ``txt`` with every space, newline, tab and carriage return removed."""
    return "".join(ch for ch in txt if ch not in " \n\t\r")
def twospaces(w1txt, w2txt):
    """Return True if joining the two texts leaves two spaces in a row.

    That is the case when ``w1txt`` ends with two spaces, when ``w2txt``
    starts with two spaces, or when ``w1txt`` ends with a space and ``w2txt``
    starts with one. ``None`` is treated like an empty string.
    """
    first = w1txt or ""
    second = w2txt or ""
    return (
        first.endswith("  ")
        or second.startswith("  ")
        or (first.endswith(" ") and second.startswith(" "))
    )
def trailing_leading(wtxt, w2txt):
    """Count the spaces at the end of ``wtxt`` and at the start of ``w2txt``.

    Returns:
        ``(trl_spcs, ldg_spcs)``: number of trailing spaces in ``wtxt`` and
        number of leading spaces in ``w2txt``. Only the plain space character
        is counted.
    """
    trl_spcs = len(wtxt) - len(wtxt.rstrip(" "))
    ldg_spcs = len(w2txt) - len(w2txt.lstrip(" "))
    return trl_spcs, ldg_spcs