#!/usr/bin/env python
# coding=utf-8
#
# Copyright (c) 2023 David Burghoff
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#

# Set to True for debugging the parser
DEBUG_PARSER = False

# Set to True for checking why elements aren't merging
DEBUG_MERGE = False

# number of spaces beyond which text will be merged/split
NUM_SPACES = 1.0
# x tolerance (number of spaces), let be big since there are
# kerning inaccuracies (as big as -0.56 in Whitney)
XTOLEXT = 0.6
# y tolerance (fraction of cap height), should be pretty small
YTOLEXT = 0.1
# left tolerance for manual kerning removal, used to be huge but is now tighter
# since differential kerning was made default for PDF
XTOLMKN = 1.5
# right tolerance for manual kerning removal, should be fairly open-minded
XTOLMKP = 0.99
# y tolerance for manual kerning removal (scaled by the chunk's ``mch``)
YTOLMK = 0.01
# tolerance for manual kerning splitting, should be fairly tight
XTOLSPLIT = 0.5
# ensuring sub/superscripts are smaller helps reduce false merges
SUBSUPER_THR = 0.99
# superscripts must be at least 1/3 of the way above the baseline to merge
# (1/3 below cap for sub)
SUBSUPER_YTHR = 1 / 3

import inkex
import inkex.text.parser as tp

import os
import sys
import re

import numpy as np

sys.path.append(
    os.path.dirname(os.path.realpath(sys.argv[0]))
)  # make sure my directory is on the path
import dhelpers as dh


def remove_kerning(
    els,
    removemanual,
    mergesupersub,
    splitdistant,
    mergenearby,
    justification=None,
    debugparser=False,
):
    """Run the kerning-removal pipeline on the text elements in ``els``.

    Args:
        els: Elements to process; only text elements are acted on.
        removemanual: If truthy, convert differential kerning to absolute
            kerning and run ``Remove_Manual_Kerning``.
        mergesupersub: If truthy, allow sub/superscript merges in
            ``External_Merges``.
        splitdistant: If truthy, run the three splitting passes.
        mergenearby: If truthy, allow same-baseline merges in
            ``External_Merges``.
        justification: Passed to ``Change_Justification``; ``None`` leaves
            the justification alone.
        debugparser: If truthy (or ``DEBUG_PARSER`` is set), only draw
            per-character highlights instead of modifying the text.

    Returns:
        ``dh.unique(els + tels)``: the input elements plus any text elements
        created by the splitting passes.
    """
    tels = [el for el in els if isinstance(el, (inkex.TextElement, inkex.FlowRoot))]
    if len(tels) > 0:
        tels[0].croot.make_char_table(tels)

    if DEBUG_PARSER or debugparser:
        for el in tels:
            el.parsed_text.make_highlights("char")
    else:
        # Do merges first (deciding based on original position)
        tels = [el for el in els if isinstance(el, (inkex.TextElement,))]
        ptl = tp.ParsedTextList(tels)
        ptl.precalcs()
        ptl.make_next_chain()

        if removemanual:
            for pt in ptl:
                pt.differential_to_absolute_kerning()
                pt.make_next_chain()
            tels = Remove_Manual_Kerning(tels, mergesupersub)
        if mergenearby or mergesupersub:
            tels = External_Merges(tels, mergenearby, mergesupersub)

        # Then do splits (deciding based on current position, not original
        # position, since merges intentionally change position)
        if splitdistant:
            tels = Split_Distant_Chunks(tels)
            tels = Split_Distant_Intrachunk(tels)
            tels = Split_Lines(tels)

        # Final tweaks
        tels = Change_Justification(tels, justification)
        tels, removedspc = Remove_Trailing_Leading_Spaces(tels)
        if removemanual or mergenearby or mergesupersub or removedspc:
            tels = Fix_Merge_Positions(tels)
        tels = Make_All_Editable(tels)
        tels = Final_Cleanup(tels)
    return dh.unique(els + tels)


def Final_Cleanup(els):
    """Call ``delete_empty()`` on every element's parsed text; return ``els``."""
    for el in els:
        el.parsed_text.delete_empty()
    return els


def Fix_Merge_Positions(els):
    """Call ``fix_merged_position()`` on every chunk of every line; return ``els``."""
    for el in els:
        for line in el.parsed_text.lns:
            for w in line.chks:
                w.fix_merged_position()
    return els


def Remove_Trailing_Leading_Spaces(els):
    """Delete trailing and leading space characters from every line.

    Inkscape-generated multiline text and flowed text are skipped.

    Returns:
        A tuple ``(els, removed)`` where ``removed`` is True if at least one
        character was deleted.
    """
    removed = False
    for el in els:
        ptxt = el.parsed_text
        if ptxt.ismlinkscape or ptxt.isflow:
            continue  # skip Inkscape-generated text
        for line in ptxt.lns:
            # Trailing spaces: walk backwards from the last character
            mtxt = line.txt()
            ii = len(mtxt) - 1
            while ii >= 0 and mtxt[ii] == " ":
                line.chrs[ii].delc()
                ii -= 1
                removed = True

            # Leading spaces: text is re-read since characters may be gone;
            # always delete index 0 because each deletion shifts the rest
            mtxt = line.txt()
            ii = 0
            while ii < len(mtxt) and mtxt[ii] == " ":
                line.chrs[0].delc()
                ii += 1
                removed = True
    return els, removed


def Make_All_Editable(els):
    """Call ``make_editable()`` on every element's parsed text; return ``els``."""
    for el in els:
        el.parsed_text.make_editable()
    return els


def Change_Justification(els, justification):
    """Re-align every line to ``justification`` and update the element style.

    Does nothing when ``justification`` is None. Inkscape-generated multiline
    text and flowed text are skipped. ``justification`` must be one of
    ``"start"``, ``"middle"`` or ``"end"`` (the keys of the text-align map).
    """
    if justification is not None:
        alignd = {"start": "start", "middle": "center", "end": "end"}
        for ptxt in [el.parsed_text for el in els]:
            if ptxt.ismlinkscape or ptxt.isflow:
                continue  # skip Inkscape-generated text
            for line in ptxt.lns:
                line.change_alignment(justification)
            # NOTE(review): relies on cstyle.__setitem__ accepting several
            # key/value pairs in one call -- confirm against the cstyle class
            ptxt.textel.cstyle.__setitem__(
                "text-anchor", justification, "text-align", alignd[justification]
            )
    return els


# Split different lines
def Split_Lines(els, ignoreinkscape=True):
    """Split every line after the first into its own text element.

    New elements are appended to ``els`` (which is also returned). Flowed
    text is skipped, as is Inkscape-generated multiline text unless
    ``ignoreinkscape`` is False.
    """
    # Snapshot the parsed texts so elements appended below are not revisited
    for ptxt in [el.parsed_text for el in els]:
        if (
            ptxt.lns is not None
            and len(ptxt.lns) > 1
            and (not (ptxt.ismlinkscape) or not (ignoreinkscape))
            and not (ptxt.isflow)
        ):
            for il in reversed(range(1, len(ptxt.lns))):
                newtxt = ptxt.split_off_characters(ptxt.lns[il].chrs)
                els.append(newtxt)
    return els


# Generate splitting of distantly-kerned text
def Split_Distant_Chunks(els):
    """Split lines between chunks that are further apart than the tolerance.

    For each line the chunks are sorted by ``x``; a split is recorded
    wherever the next chunk starts more than ``dx + xtol`` beyond the end of
    the previous one. Each resulting group of chunks (except the first) is
    split off into a new element, which is appended to ``els``.
    """
    for ptxt in [el.parsed_text for el in els]:
        if ptxt.lns is None:
            continue
        for il in reversed(range(len(ptxt.lns))):
            line = ptxt.lns[il]
            sws = sorted(line.chks, key=lambda w: w.x)  # chunks in ascending x
            splits = []
            for ii in range(1, len(line.chks)):
                w = sws[ii - 1]
                w2 = sws[ii]

                trl_spcs, ldg_spcs = trailing_leading(w.txt, w2.txt)
                dx = w.spw * (NUM_SPACES - trl_spcs - ldg_spcs)
                xtol = XTOLSPLIT * w.spw

                tr1, br1, tl2, bl2 = w.get_ut_pts(w2, current_pts=True)

                if bl2[0] > br1[0] + dx + xtol:
                    splits.append(ii)
            line.splits = splits
            line.sws = sws

            # Split off from the right so earlier indices stay valid
            for ii in reversed(range(len(splits))):
                sstart = splits[ii]
                if ii != len(splits) - 1:
                    sstop = splits[ii + 1]
                else:
                    sstop = len(line.chks)

                newtxt = ptxt.split_off_chunks(sws[sstart:sstop])
                els.append(newtxt)
    return els


# Generate splitting of distantly-kerned text
def Split_Distant_Intrachunk(els):
    """Split chunks internally where characters are too far apart.

    Within each chunk the characters are sorted by the x of their first
    ``pts_ut`` point. A split is made before a character when it starts more
    than ``dx + xtol`` beyond the last non-space character, or when it is a
    separator (space or minus) sitting between two numbers in the same
    element. Split-off characters become new elements appended to ``els``.
    Inkscape-generated multiline text and flowed text are skipped.
    """
    spaces = [" ", "\u00a0"]
    numbersplits = [" ", "-", "−"]  # chars that may separate numbers
    numsplit_re = re.compile("|".join(numbersplits))

    for ptxt in [el.parsed_text for el in els]:
        if ptxt.lns is None or ptxt.ismlinkscape or ptxt.isflow:
            continue
        for line in ptxt.lns:
            for w in line.chks:
                if len(w.chrs) == 0:
                    continue
                chrs = sorted(w.chrs, key=lambda ch: ch.pts_ut[0][0])
                lastnspc = None  # last non-space character seen so far
                splitiis = []
                prevsplit = 0
                if chrs[0].c not in spaces:
                    lastnspc = chrs[0]
                for ii in range(1, len(chrs)):
                    if lastnspc is not None:
                        c = lastnspc
                        c2 = chrs[ii]

                        bl2 = c2.pts_ut[0]
                        br1 = c.pts_ut[3]

                        dx = w.spw * (NUM_SPACES)
                        xtol = XTOLSPLIT * w.spw

                        # If this character is splitting two numbers,
                        # should always split in case they are ticks
                        remainingnumeric = False
                        splrest = [v for v in numsplit_re.split(w.txt[ii:]) if v != ""]
                        if len(splrest) > 0:
                            remainingnumeric = isnumeric(splrest[0])
                        numbersplit = (
                            isnumeric(w.txt[prevsplit:ii])
                            and (c2.c in numbersplits and remainingnumeric)
                            and c.loc.elem == c2.loc.elem
                        )

                        if bl2[0] > br1[0] + dx + xtol or numbersplit:
                            splitiis.append(ii)
                            prevsplit = ii
                    if chrs[ii].c not in spaces:
                        lastnspc = chrs[ii]

                # Split off from the right so earlier indices stay valid
                for ii in reversed(range(len(splitiis))):
                    sstart = splitiis[ii]
                    if ii != len(splitiis) - 1:
                        sstop = splitiis[ii + 1]
                    else:
                        sstop = len(chrs)
                    # Keep the characters in their original w.chrs order
                    group = chrs[sstart:sstop]
                    split_chrs = [ch for ch in w.chrs if ch in group]
                    newtxt = ptxt.split_off_characters(split_chrs)
                    els.append(newtxt)
    return els


def Remove_Manual_Kerning(els, mergesupersub):
    """Merge each chunk with its ``nextw`` when they are manually kerned.

    A chunk is merged with the following chunk when the start of the next
    chunk lies within the x window ``[-XTOLMKN, dx + XTOLMKP]`` (in space
    widths) and the y window ``YTOLMK * mch`` of this chunk's end. After
    merging, any line that still has several chunks is split so that each
    chunk ends up in its own text element; new elements are appended to
    ``els``, which is returned.

    ``mergesupersub`` is accepted for interface compatibility but is not
    used by this function.
    """
    # Generate list of merges
    chks = []
    ptxts = [el.parsed_text for el in els]
    for ptxt in ptxts:
        if ptxt.lns is not None:
            chks += [w for line in ptxt.lns for w in line.chks]

    for w in chks:
        mw = []
        w2 = w.nextw
        if w2 is not None and w2 in chks and not (twospaces(w.txt, w2.txt)):
            trl_spcs, ldg_spcs = trailing_leading(w.txt, w2.txt)
            dx = w.spw * (NUM_SPACES - trl_spcs - ldg_spcs)
            xtoln = XTOLMKN * w.spw
            xtolp = XTOLMKP * w.spw
            ytol = YTOLMK * w.mch

            try:
                tr1, br1, tl2, bl2 = w.get_ut_pts(w2)
            except ZeroDivisionError:
                w.mw = mw
                continue

            if isnumeric(w.txt) and isnumeric(w2.txt, True):
                dx = w.spw * 0

            previoussp = w.txt == " " and w.prevw is not None
            validmerge = br1[0] - xtoln <= bl2[0] <= br1[0] + dx + xtolp
            validmerge = validmerge and br1[1] - ytol <= bl2[1] <= br1[1] + ytol

            if previoussp and not validmerge:
                # reconsider in case previous space was weirdly-kerned
                tr1p, br1p, tl2p, bl2p = w.prevw.get_ut_pts(w2)
                dx = w.spw * (NUM_SPACES - trl_spcs - ldg_spcs + 1)
                validmerge = br1p[0] - xtoln <= bl2p[0] <= br1p[0] + dx + xtolp
            if validmerge:
                mw.append([w2, "same", br1, bl2])
        w.mw = mw
    Perform_Merges(chks, mk=True)

    # Following manual kerning removal, lines with multiple chunks
    # need to be split out into new text els
    newptxts = []
    for ptxt in ptxts:
        if ptxt.lns is None:
            continue
        for line in ptxt.lns:
            while len(line.chks) > 1:
                newtxt = ptxt.split_off_chunks([line.chks[-1]])
                els.append(newtxt)
                # NOTE(review): the list itself is unused; the access is kept
                # in case reading .parsed_text has a parsing side effect
                newptxts.append(newtxt.parsed_text)
    return els


def External_Merges(els, mergenearby, mergesupersub):
    """Find and perform merges between chunks of different text elements.

    Candidate pairs are chunks with (nearly) the same angle whose bounding
    boxes intersect after the first is enlarged by
    ``spw * scf * (NUM_SPACES + XTOLEXT)``. For each candidate the second
    chunk's points are expressed in the first chunk's coordinate system and
    classified as a same-baseline merge (requires ``mergenearby``) or a
    sub/superscript merge (requires ``mergesupersub``). The recorded
    candidates are then handed to ``Perform_Merges``.

    Returns:
        ``els`` unchanged as a list (the elements themselves are modified).
    """
    # Generate list of merges
    chks = []
    for ptxt in [el.parsed_text for el in els]:
        if ptxt.lns is not None:
            chks += [w for line in ptxt.lns for w in line.chks]

    # Bounding box of each chunk from its characters' parsed points
    pbbs = [None] * len(chks)
    for ii, w in enumerate(chks):
        cx = [v[0] for c in w.chrs for v in c.parsed_pts_t]
        cy = [v[1] for c in w.chrs for v in c.parsed_pts_t]
        pbbs[ii] = tp.bbox(
            [min(cx), min(cy), max(cx) - min(cx), max(cy) - min(cy)]
        )

    for ii, w in enumerate(chks):
        dx = (
            w.spw * w.scf * (NUM_SPACES + XTOLEXT)
        )  # a big bounding box that includes the extra space
        w.bb_big = tp.bbox(
            [
                pbbs[ii].x1 - dx,
                pbbs[ii].y1 - dx,
                pbbs[ii].w + 2 * dx,
                pbbs[ii].h + 2 * dx,
            ]
        )
        w.mw = []

    # Vectorized angle / bbox calculations
    angles = np.array([[w.angle for w in chks]])
    sameangle = abs(angles - angles.T) < 0.001
    bb1s = [w.bb_big for w in chks]
    bb2s = pbbs
    intersects = dh.bb_intersects(bb1s, bb2s)
    # reshape(-1,1) is a transpose
    potentials = np.logical_and(sameangle, intersects)
    potentials = np.logical_and(
        potentials, np.identity(len(chks)) == 0
    )  # off-diagonal only
    goodl = np.argwhere(potentials)

    for ii in range(goodl.shape[0]):
        w = chks[goodl[ii, 0]]
        w2 = chks[goodl[ii, 1]]

        trl_spcs, ldg_spcs = trailing_leading(w.txt, w2.txt)
        dx = w.spw * (NUM_SPACES - trl_spcs - ldg_spcs)
        xtol = XTOLEXT * w.spw
        ytol = YTOLEXT * w.mch

        # calculate 2's coords in 1's system
        tr1, br1, tl2, bl2 = w.get_ut_pts(w2)
        xpenmatch = br1[0] - xtol <= bl2[0] <= br1[0] + dx + xtol
        neitherempty = len(wstrip(w.txt)) > 0 and len(wstrip(w2.txt)) > 0

        # Reset for every pair so the debug output below never reads a stale
        # value when the candidate is rejected before classification
        mtype = None
        if xpenmatch and neitherempty and not twospaces(w.txt, w2.txt):
            # Don't sub/super merge when differences in font-weight
            # Helps prevent accidental merges of subfigure label to tick
            weight_match = (
                w.chrs[-1].tsty["font-weight"] == w2.chrs[0].tsty["font-weight"]
            )
            # Don't sub/super merge when is letter enclosed in parentheses
            # Helps prevent accidental merges of subfigure label to tick
            letterinpar = bool(re.fullmatch(r"^\([a-zA-Z]\)$", w.txt))
            subsuper_ok = mergesupersub and weight_match and not letterinpar

            if (
                abs(bl2[1] - br1[1]) < ytol
                and abs(w.tfs - w2.tfs) < 0.001
                and mergenearby
            ):
                if isnumeric(w.line.txt()) and isnumeric(w2.line.txt(), True):
                    numsp = (bl2[0] - br1[0]) / (w.spw)
                    if abs(numsp) < 0.25:
                        # only merge numbers if very close (could be x ticks)
                        mtype = "same"
                else:
                    mtype = "same"
            elif br1[1] + ytol >= bl2[1] >= tr1[1] - ytol and subsuper_ok:
                # above baseline
                aboveline = (
                    br1[1] * (1 - SUBSUPER_YTHR) + tr1[1] * SUBSUPER_YTHR + ytol
                    >= bl2[1]
                )
                if w2.tfs < w.tfs * SUBSUPER_THR:  # new smaller, expect super
                    if aboveline:
                        mtype = "super"
                elif w.tfs < w2.tfs * SUBSUPER_THR:  # old smaller, expect return
                    mtype = "subreturn"
                elif SUBSUPER_THR == 1:
                    if aboveline:
                        if len(w2.line.txt()) > 2:  # long text, probably not super
                            mtype = "subreturn"
                        else:
                            mtype = "superorsubreturn"  # could be either, decide later
                    else:
                        mtype = "subreturn"
            elif br1[1] + ytol >= tl2[1] >= tr1[1] - ytol and subsuper_ok:
                belowline = (
                    tl2[1]
                    >= br1[1] * SUBSUPER_YTHR + tr1[1] * (1 - SUBSUPER_YTHR) - ytol
                )
                if w2.tfs < w.tfs * SUBSUPER_THR:  # new smaller, expect sub
                    if belowline:
                        mtype = "sub"
                elif w.tfs < w2.tfs * SUBSUPER_THR:  # old smaller, expect superreturn
                    mtype = "superreturn"
                elif SUBSUPER_THR == 1:
                    if belowline:
                        if len(w2.line.txt()) > 2:  # long text, probably not sub
                            mtype = "superreturn"
                        else:
                            mtype = "suborsuperreturn"  # could be either, decide later
                    else:
                        mtype = "superreturn"

            if mtype is not None:
                w.mw.append([w2, mtype, br1, bl2])
                # dh.debug(w.txt+' to '+w2.txt+' as '+mtype)

        if DEBUG_MERGE:
            dh.idebug('\nMerging "' + w.txt + '" and "' + w2.txt + '"')
            if not (xpenmatch):
                dh.idebug("Aborted, x pen too far: " + str([br1[0], bl2[0], dx]))
            elif not (neitherempty):
                dh.idebug("Aborted, one empty")
            else:
                if mtype is None:
                    if not (abs(bl2[1] - br1[1]) < ytol):
                        dh.idebug("Aborted, y pen too far: " + str([bl2[1], br1[1]]))
                    elif not (abs(w.tfs - w2.tfs) < 0.001):
                        dh.idebug(
                            "Aborted, fonts too different: " + str([w.tfs, w2.tfs])
                        )
                    elif not (
                        not (isnumeric(w.line.txt()))
                        or not (isnumeric(w2.line.txt()))
                    ):
                        dh.idebug("Aborted, both numbers")
                else:
                    dh.idebug("Merged as " + mtype)

    Perform_Merges(chks)
    return els


# Allowed transitions of the current script state for each merge type.
# A merge type missing from the current state's table invalidates the chain.
_MERGE_TRANSITIONS = {
    "normal": {
        "same": "normal",
        "sub": "sub",
        "super": "super",
        "suborsuperreturn": "sub",
        "superorsubreturn": "super",
    },
    "super": {
        "same": "super",
        "superreturn": "normal",
        "suborsuperreturn": "normal",
    },
    "sub": {
        "same": "sub",
        "subreturn": "normal",
        "superorsubreturn": "normal",
    },
}


def Perform_Merges(chks, mk=False):
    """Execute the merges recorded in each chunk's ``mw`` list.

    Each ``mw`` entry is ``[w2, mtype, br1, bl2]``. The function

    1. keeps, per chunk, the candidate whose start x best matches the
       chunk's end x,
    2. links candidates into chains (marking chained chunks ``merged``),
    3. validates each chain against the normal/sub/super state machine,
       discarding chains with an impossible sequence, and
    4. appends the chained chunks to the head chunk and unions clip paths
       when the merged chunks came from different text elements.

    Args:
        chks: Chunks to process; every chunk must have an ``mw`` attribute.
        mk: True when called for manual-kerning removal; in that case no
            extra spaces are inserted before a chunk whose
            ``prevsametspan`` is truthy if the combined text has a space.
    """
    for w in chks:
        mw = w.mw
        minx = float("inf")
        mi = 0  # default to the first candidate if no distance compares smaller
        for ii, (_, _, br1, bl2) in enumerate(mw):
            gap = abs(bl2[0] - br1[0])
            if gap < minx:
                # starting pen best matches the stop of the previous one
                minx = gap
                mi = ii
        w.merged = False
        if len(mw) > 0:
            w.merges = [mw[mi][0]]
            w.mergetypes = [mw[mi][1]]
        else:
            w.merges = []
            w.mergetypes = []

    # Generate chains of merges
    for w in chks:
        if not (w.merged) and len(w.merges) > 0:
            w.merges[-1].merged = True
            nextmerge = w.merges[-1].merges
            nextmerget = w.merges[-1].mergetypes
            while len(nextmerge) > 0:
                w.merges += nextmerge
                w.mergetypes += nextmerget
                w.merges[-1].merged = True
                nextmerge = w.merges[-1].merges
                nextmerget = w.merges[-1].mergetypes

    # Create a merge plan
    for w in chks:
        if len(w.merges) > 0:
            ctype = "normal"
            w.wtypes = [ctype]
            bail = False
            for mt in w.mergetypes:
                newtype = _MERGE_TRANSITIONS[ctype].get(mt)
                if newtype is None:
                    # Impossible sequence (e.g. maybe started on sub/super)
                    bail = True
                    break
                ctype = newtype
                w.wtypes.append(ctype)
            if bail:
                w.wtypes = []
                w.merges = []

    # Execute the merge plan
    for w in chks:
        if len(w.merges) > 0 and not (w.merged):
            alltxt = "".join([w.txt] + [w2.txt for w2 in w.merges])
            hasspaces = " " in alltxt

            mels = []
            for ii, w2 in enumerate(w.merges):
                wtype = w.wtypes[ii + 1]
                maxspaces = None
                if mk and hasspaces and w2.prevsametspan:
                    maxspaces = 0
                # w.txt is re-read on every pass because append_chk below
                # is expected to extend it
                endsinspace = (
                    w.txt is not None and len(w.txt) > 0 and w.txt[-1] == " "
                )
                if endsinspace or wtype in ["super", "sub"]:
                    # no extra spaces for sub/supers or if there's already one
                    maxspaces = 0

                # Record the source element before the chunk is moved
                mels.append(w2.line.ptxt.textel)
                w.append_chk(w2, wtype, maxspaces)

            # Union clips if necessary
            mels = dh.unique([w.line.ptxt.textel] + mels)
            if len(mels) > 1:
                clips = [el.get_link("clip-path") for el in mels]
                if any([c is None for c in clips]):
                    w.line.ptxt.textel.set("clip-path", None)
                else:
                    # Duplicate main clip
                    dc = clips[0].duplicate()
                    wt = mels[0].ccomposed_transform
                    for ii in range(1, len(mels)):
                        # Duplicate merged clip, group contents, move to main dupe
                        dc2 = clips[ii].duplicate()
                        ng = dh.group(list(dc2))
                        dc.append(ng)
                        ng.ctransform = (-wt) @ mels[ii].ccomposed_transform
                        dc2.delete()
                    mels[0].set("clip-path", dc.get_id(2))


# Characters that may appear in a number (not referenced in this file)
ncs = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "E", "-", "−", ","]


# Check if text represents a number
def isnumeric(s, countminus=False):
    """Return True if ``s`` parses as a float.

    Surrounding whitespace is stripped, the Unicode minus sign is treated as
    ``-`` and commas are removed before parsing. With ``countminus`` a lone
    minus sign also counts as a number.
    """
    s = s.strip().replace("−", "-").replace(",", "")
    if countminus and s == "-":  # count a minus sign as a number
        return True
    try:
        float(s)
    except ValueError:
        return False
    return True


# Strip whitespaces
def wstrip(txt):
    """Return ``txt`` with every space, newline, tab and carriage return removed."""
    return txt.translate({ord(c): None for c in " \n\t\r"})


def twospaces(w1txt, w2txt):
    """Return True if joining ``w1txt`` and ``w2txt`` puts two spaces together.

    That is the case when ``w1txt`` ends with two spaces, ``w2txt`` starts
    with two spaces, or ``w1txt`` ends with a space and ``w2txt`` starts
    with one. ``None`` and empty strings never contribute a space.
    """
    if w1txt is not None and w1txt.endswith("  "):
        return True
    if w2txt is not None and w2txt.startswith("  "):
        return True
    # resultant chunk has two spaces at the junction
    return bool(w1txt) and bool(w2txt) and w1txt[-1] == " " and w2txt[0] == " "


def trailing_leading(wtxt, w2txt):
    """Return ``(trailing spaces of wtxt, leading spaces of w2txt)``."""
    trl_spcs = len(wtxt) - len(wtxt.rstrip(" "))
    ldg_spcs = len(w2txt) - len(w2txt.lstrip(" "))
    return trl_spcs, ldg_spcs