dots/config/inkscape/extensions/org.inkscape.extension.30306/scientific_inkscape/remove_kerning.py

#!/usr/bin/env python
# coding=utf-8
#
# Copyright (c) 2023 David Burghoff <burghoff@utexas.edu>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

# Debug switches. Each flag is assigned twice and the second assignment wins,
# so both are off as written; comment out the "= False" line to enable one.

# for debugging parser (remove_kerning then only highlights parsed characters)
DEBUG_PARSER = True
DEBUG_PARSER = False

# for checking why elements aren't merging (External_Merges logs each decision)
DEBUG_MERGE = True
DEBUG_MERGE = False

# Tuning constants. Each comment describes the constant directly above it.
# The x values are multiplied by a chunk's spw and the y values by its mch
# in the functions below.
NUM_SPACES = 1.0
# number of spaces beyond which text will be merged/split
XTOLEXT = 0.6
# x tolerance (number of spaces) for External_Merges, let be big since there
# are kerning inaccuracies (as big as -0.56 in Whitney)
YTOLEXT = 0.1
# y tolerance (fraction of cap height) for External_Merges, should be pretty
# small
XTOLMKN = 1.5
# left tolerance for manual kerning removal, used to be huge but is now tighter
# since differential kerning was made default for PDF
XTOLMKP = (
    0.99
)
# right tolerance for manual kerning removal, should be fairly open-minded
YTOLMK = .01
# y tolerance for manual kerning removal (multiplied by mch, presumably the
# cap height as for YTOLEXT)
XTOLSPLIT = 0.5
# tolerance for manual kerning splitting, should be fairly tight
SUBSUPER_THR = 0.99
# a chunk counts as smaller when its tfs is below this fraction of the other's;
# ensuring sub/superscripts are smaller helps reduce false merges
SUBSUPER_YTHR = 1 / 3
# superscripts must be at least 1/3 of the way above the baseline to merge
# (1/3 below cap for sub)

import inkex
import inkex.text.parser as tp

import os, sys, re

sys.path.append(
    os.path.dirname(os.path.realpath(sys.argv[0]))
)  # make sure my directory is on the path
import dhelpers as dh


def remove_kerning(
    els,
    removemanual,
    mergesupersub,
    splitdistant,
    mergenearby,
    justification=None,
    debugparser=False,
):
    """Run the kerning-removal pipeline over the text elements in ``els``.

    Args:
        els: Elements to process. Only ``inkex.TextElement`` instances go
            through the pipeline; ``inkex.FlowRoot`` instances are included
            only in the character-table call and the debug highlighting.
        removemanual: Merge manually kerned chunks (Remove_Manual_Kerning).
        mergesupersub: Allow sub/superscript merges in External_Merges.
        splitdistant: Split distant chunks, distant characters within a
            chunk, and separate lines into their own elements.
        mergenearby: Allow same-baseline merges in External_Merges.
        justification: Passed to Change_Justification; ``None`` leaves the
            alignment alone.
        debugparser: When true (or when DEBUG_PARSER is set), only call
            ``make_highlights("char")`` on each parsed text and skip the
            pipeline.

    Returns:
        ``dh.unique(els + tels)``, where ``tels`` holds the processed text
        elements plus anything the merge/split steps appended.
    """
    tels = [el for el in els if isinstance(el, (inkex.TextElement, inkex.FlowRoot))]
    if tels:
        tels[0].croot.make_char_table(tels)

    if DEBUG_PARSER or debugparser:
        for el in tels:
            el.parsed_text.make_highlights("char")
        return dh.unique(els + tels)

    # Do merges first (deciding based on original position)
    tels = [el for el in els if isinstance(el, (inkex.TextElement,))]
    ptl = tp.ParsedTextList(tels)
    ptl.precalcs()
    ptl.make_next_chain()
    if removemanual:
        for pt in ptl:
            pt.differential_to_absolute_kerning()
            pt.make_next_chain()
        tels = Remove_Manual_Kerning(tels, mergesupersub)
    if mergenearby or mergesupersub:
        tels = External_Merges(tels, mergenearby, mergesupersub)

    # Then do splits (deciding based on current position, not original
    # position, since merges intentionally change position)
    if splitdistant:
        for split_step in (
            Split_Distant_Chunks,
            Split_Distant_Intrachunk,
            Split_Lines,
        ):
            tels = split_step(tels)

    # Final tweaks
    tels = Change_Justification(tels, justification)
    tels, removedspc = Remove_Trailing_Leading_Spaces(tels)
    if removemanual or mergenearby or mergesupersub or removedspc:
        tels = Fix_Merge_Positions(tels)
    tels = Make_All_Editable(tels)
    tels = Final_Cleanup(tels)
    return dh.unique(els + tels)


def Final_Cleanup(els):
    """Call ``parsed_text.delete_empty()`` on each element, then return ``els``."""
    for ptxt in (el.parsed_text for el in els):
        ptxt.delete_empty()
    return els


def Fix_Merge_Positions(els):
    """Call ``fix_merged_position()`` on every chunk of every line.

    Walks ``el.parsed_text.lns`` and each line's ``chks`` in order and
    returns ``els`` itself.
    """
    for el in els:
        chunks = (chunk for line in el.parsed_text.lns for chunk in line.chks)
        for chunk in chunks:
            chunk.fix_merged_position()
    return els


def Remove_Trailing_Leading_Spaces(els):
    """Delete trailing and leading space characters from every line.

    Parsed texts flagged ``ismlinkscape`` or ``isflow`` (Inkscape-generated
    text) are skipped. Spaces are removed by calling ``delc()`` on the
    matching entries of ``line.chrs``: trailing ones from the end backwards,
    then leading ones from the front.

    Returns:
        ``(els, removed)``, where ``removed`` is True if any character was
        deleted.
    """
    removed = False
    for el in els:
        if el.parsed_text.ismlinkscape or el.parsed_text.isflow:
            continue  # skip Inkscape-generated text
        for line in el.parsed_text.lns:
            # Trailing spaces: delete from the last index backwards
            text = line.txt()
            n_trailing = len(text) - len(text.rstrip(" "))
            last = len(text) - 1
            for idx in range(last, last - n_trailing, -1):
                line.chrs[idx].delc()

            # Leading spaces: re-read the text, then repeatedly delete the
            # first character
            text = line.txt()
            n_leading = len(text) - len(text.lstrip(" "))
            for _ in range(n_leading):
                line.chrs[0].delc()

            if n_trailing or n_leading:
                removed = True
    return els, removed


def Make_All_Editable(els):
    """Call ``parsed_text.make_editable()`` on each element, then return ``els``."""
    for ptxt in (el.parsed_text for el in els):
        ptxt.make_editable()
    return els

def Change_Justification(els, justification):
    """Re-align every line of each text to ``justification``.

    Does nothing when ``justification`` is None. Parsed texts flagged
    ``ismlinkscape`` or ``isflow`` (Inkscape-generated text) are skipped.
    For the rest, each line's ``change_alignment`` is called and the
    element's ``cstyle`` receives matching ``text-anchor`` and ``text-align``
    values.

    Args:
        els: Elements exposing ``parsed_text``.
        justification: ``"start"``, ``"middle"``, ``"end"`` or None.

    Returns:
        ``els`` itself.
    """
    if justification is None:
        return els

    alignd = {"start": "start", "middle": "center", "end": "end"}
    parsed_texts = [el.parsed_text for el in els]
    for ptxt in parsed_texts:
        if ptxt.ismlinkscape or ptxt.isflow:
            continue  # skip Inkscape-generated text
        for line in ptxt.lns:
            line.change_alignment(justification)
        # NOTE(review): __setitem__ is called with two key/value pairs, so
        # cstyle presumably is a project style class accepting that form.
        ptxt.textel.cstyle.__setitem__(
            "text-anchor", justification, "text-align", alignd[justification]
        )
    return els


# Split different lines
def Split_Lines(els, ignoreinkscape=True):
    """Split every line after the first into its own text element.

    Only parsed texts with more than one line are touched. Flowed text
    (``isflow``) is always skipped, and text flagged ``ismlinkscape`` is
    skipped unless ``ignoreinkscape`` is False. Lines are split off from the
    last one backwards with ``split_off_characters`` and each result is
    appended to ``els``.

    Returns:
        ``els`` itself, extended with the newly created items.
    """
    # Snapshot the parsed texts first so appended items are not revisited
    for ptxt in [el.parsed_text for el in els]:
        if ptxt.lns is None or len(ptxt.lns) <= 1:
            continue
        if ptxt.ismlinkscape and ignoreinkscape:
            continue
        if ptxt.isflow:
            continue
        for il in range(len(ptxt.lns) - 1, 0, -1):
            els.append(ptxt.split_off_characters(ptxt.lns[il].chrs))
    return els


# Generate splitting of distantly-kerned text
def Split_Distant_Chunks(els):
    """Split a line wherever two x-adjacent chunks are too far apart.

    For each line the chunks are sorted by ascending ``x``. A boundary is
    recorded between neighbours when the second chunk's bottom-left x exceeds
    the first chunk's bottom-right x by more than the allowed gap:
    ``spw * (NUM_SPACES - trailing - leading spaces)`` plus a tolerance of
    ``XTOLSPLIT * spw``. Each group of chunks after a boundary is split off
    with ``split_off_chunks`` (last group first) and appended to ``els``.

    Side effects: stores the boundary indices on ``line.splits`` and the
    sorted chunks on ``line.sws``.

    Returns:
        ``els`` itself, extended with the newly created items.
    """
    for ptxt in [el.parsed_text for el in els]:
        if ptxt.lns is None:
            continue
        for il in reversed(range(len(ptxt.lns))):
            line = ptxt.lns[il]
            # chunks sorted in ascending x (stable for equal x)
            ordered = sorted(line.chks, key=lambda chunk: chunk.x)

            boundaries = []
            for ii in range(1, len(line.chks)):
                left = ordered[ii - 1]
                right = ordered[ii]

                trl_spcs, ldg_spcs = trailing_leading(left.txt, right.txt)
                gap = left.spw * (NUM_SPACES - trl_spcs - ldg_spcs)
                slack = XTOLSPLIT * left.spw

                _, br1, _, bl2 = left.get_ut_pts(right, current_pts=True)
                if bl2[0] > br1[0] + gap + slack:
                    boundaries.append(ii)
            line.splits = boundaries
            line.sws = ordered

            # Each group runs from its boundary to the next one (or to the
            # end of the line); split them off from the last group backwards.
            stops = boundaries[1:] + [len(line.chks)]
            for sstart, sstop in reversed(list(zip(boundaries, stops))):
                els.append(ptxt.split_off_chunks(ordered[sstart:sstop]))
    return els


# Generate splitting of distant characters within a single chunk
def Split_Distant_Intrachunk(els):
    """Split a chunk where consecutive characters are too far apart.

    Parsed texts flagged ``ismlinkscape`` or ``isflow`` are skipped. Within
    each chunk the characters are sorted by the x of ``pts_ut[0]``; every
    character is compared with the most recent non-space character before it.
    A split index is recorded when either

    * its ``pts_ut[0]`` x exceeds the previous character's ``pts_ut[3]`` x by
      more than ``spw * NUM_SPACES + XTOLSPLIT * spw``, or
    * it is a number separator (space, hyphen or Unicode minus) between two
      numeric strings of the same ``loc.elem`` (always split in case they are
      ticks).

    Character groups are split off with ``split_off_characters`` (last group
    first) and appended to ``els``.

    Returns:
        ``els`` itself, extended with the newly created items.
    """
    blanks = (" ", "\u00a0")
    numbersplits = [" ", "-", "−"]  # chars that may separate numbers
    separator = re.compile("|".join(numbersplits))

    for ptxt in [el.parsed_text for el in els]:
        if ptxt.lns is None or ptxt.ismlinkscape or ptxt.isflow:
            continue
        for line in ptxt.lns:
            for w in line.chks:
                if len(w.chrs) == 0:
                    continue
                ordered = sorted(w.chrs, key=lambda ch: ch.pts_ut[0][0])
                # most recent non-space character seen so far
                anchor = None if ordered[0].c in blanks else ordered[0]
                cut_points = []
                prevsplit = 0
                for ii in range(1, len(ordered)):
                    current = ordered[ii]
                    if anchor is not None:
                        bl2 = current.pts_ut[0]
                        br1 = anchor.pts_ut[3]

                        dx = w.spw * (NUM_SPACES)
                        xtol = XTOLSPLIT * w.spw

                        # If this character is splitting two numbers,
                        # should always split in case they are ticks
                        rest = [v for v in separator.split(w.txt[ii:]) if v != ""]
                        remainingnumeric = isnumeric(rest[0]) if rest else False
                        numbersplit = (
                            isnumeric(w.txt[prevsplit:ii])
                            and (current.c in numbersplits and remainingnumeric)
                            and anchor.loc.elem == current.loc.elem
                        )

                        if bl2[0] > br1[0] + dx + xtol or numbersplit:
                            cut_points.append(ii)
                            prevsplit = ii
                    if current.c not in blanks:
                        anchor = current

                # Each group runs from its cut point to the next one (or to
                # the end); split them off from the last group backwards.
                stops = cut_points[1:] + [len(ordered)]
                for sstart, sstop in reversed(list(zip(cut_points, stops))):
                    segment = ordered[sstart:sstop]
                    # keep the chunk's own character order
                    split_chrs = [ch for ch in w.chrs if ch in segment]
                    els.append(ptxt.split_off_characters(split_chrs))
    return els


def Remove_Manual_Kerning(els, mergesupersub):
    """Merge chunks that were separated only by manual kerning.

    Every chunk ``w`` is compared with its successor ``w.nextw``. The pair is
    queued as a ``"same"`` merge when the successor's bottom-left point lies
    within ``[-XTOLMKN, dx + XTOLMKP]`` space widths of the chunk's
    bottom-right point in x and within ``YTOLMK * mch`` in y. ``dx`` is the
    number of spaces still allowed between them (zero for two numbers). If the
    chunk is a single space that failed the test, the pair is reconsidered
    from the previous chunk's position (x only) with one extra space allowed.
    The queued merges are executed by ``Perform_Merges(chks, mk=True)``, after
    which any line still holding several chunks has the extra chunks split off
    into new text elements that are appended to ``els``.

    Args:
        els: Text elements exposing ``parsed_text``.
        mergesupersub: Accepted for signature compatibility; not used here.

    Returns:
        ``els`` itself, extended with the split-off items.
    """
    # Generate list of merges
    ptxts = [el.parsed_text for el in els]
    chks = [
        w
        for ptxt in ptxts
        if ptxt.lns is not None
        for line in ptxt.lns
        for w in line.chks
    ]
    for w in chks:
        mw = []
        w2 = w.nextw
        if w2 is not None and w2 in chks and not (twospaces(w.txt, w2.txt)):
            trl_spcs, ldg_spcs = trailing_leading(w.txt, w2.txt)
            dx = w.spw * (NUM_SPACES - trl_spcs - ldg_spcs)
            xtoln = XTOLMKN * w.spw
            xtolp = XTOLMKP * w.spw
            ytol = YTOLMK * w.mch

            try:
                tr1, br1, tl2, bl2 = w.get_ut_pts(w2)
            except ZeroDivisionError:
                w.mw = mw
                continue

            if isnumeric(w.txt) and isnumeric(w2.txt, True):
                # numbers (or a number followed by a minus) get no extra space
                dx = w.spw * 0

            previoussp = w.txt == " " and w.prevw is not None
            validmerge = br1[0] - xtoln <= bl2[0] <= br1[0] + dx + xtolp
            validmerge = validmerge and br1[1] - ytol <= bl2[1] <= br1[1] + ytol
            if previoussp and not validmerge:
                # reconsider in case previous space was weirdly-kerned
                try:
                    _, br1p, _, bl2p = w.prevw.get_ut_pts(w2)
                except ZeroDivisionError:
                    # same degenerate case as guarded above: keep the pair
                    # unmerged instead of aborting the whole run
                    pass
                else:
                    dx = w.spw * (NUM_SPACES - trl_spcs - ldg_spcs + 1)
                    validmerge = br1p[0] - xtoln <= bl2p[0] <= br1p[0] + dx + xtolp

            if validmerge:
                mw.append([w2, "same", br1, bl2])

        w.mw = mw

    Perform_Merges(chks, mk=True)

    # Following manual kerning removal, lines with multiple chunks
    # need to be split out into new text els
    for ptxt in ptxts:
        if ptxt.lns is None:
            # same guard as when collecting chunks; nothing to split
            continue
        for line in ptxt.lns:
            while len(line.chks) > 1:
                els.append(ptxt.split_off_chunks([line.chks[-1]]))
    return els


import numpy as np


def External_Merges(els, mergenearby, mergesupersub):
    """Queue and perform merges between chunks of separate text elements.

    Steps:

    1. Collect every chunk and build a bounding box from its characters'
       ``parsed_pts_t`` points, plus an enlarged box (``bb_big``) padded by
       ``spw * scf * (NUM_SPACES + XTOLEXT)`` on every side.
    2. Candidate pairs ``(w, w2)`` are distinct chunks with (nearly) equal
       ``angle`` whose enlarged/plain boxes intersect.
    3. For each candidate, ``w2`` must start within the x window after ``w``
       ends, neither may be whitespace-only, and the join must not produce a
       double space. The merge type is then
       ``"same"`` (same baseline and ``tfs``; needs ``mergenearby``), or one
       of the sub/superscript types (needs ``mergesupersub``, equal
       font-weight, and ``w`` not being a parenthesised letter like ``(a)``).
    4. Queued candidates (``w.mw``) are executed by ``Perform_Merges``.

    Args:
        els: Text elements exposing ``parsed_text``.
        mergenearby: Enable same-baseline merges.
        mergesupersub: Enable sub/superscript merges.

    Returns:
        ``els`` itself.
    """
    # Generate list of merges
    chks = []
    for ptxt in [el.parsed_text for el in els]:
        if ptxt.lns is not None:
            chks += [w for line in ptxt.lns for w in line.chks]

    pbbs = [None] * len(chks)
    for ii, w in enumerate(chks):
        cx = [v[0] for c in w.chrs for v in c.parsed_pts_t]
        cy = [v[1] for c in w.chrs for v in c.parsed_pts_t]
        pbbs[ii] = tp.bbox([min(cx), min(cy), max(cx) - min(cx), max(cy) - min(cy)])

    for ii, w in enumerate(chks):
        dx = (
            w.spw * w.scf * (NUM_SPACES + XTOLEXT)
        )  # a big bounding box that includes the extra space

        w.bb_big = tp.bbox(
            [
                pbbs[ii].x1 - dx,
                pbbs[ii].y1 - dx,
                pbbs[ii].w + 2 * dx,
                pbbs[ii].h + 2 * dx,
            ]
        )
        w.mw = []

    # Vectorized angle / bbox calculations
    angles = np.array([[w.angle for w in chks]])
    sameangle = abs(angles - angles.T) < 0.001

    bb1s = [w.bb_big for w in chks]
    bb2s = pbbs
    intersects = dh.bb_intersects(bb1s, bb2s)

    potentials = np.logical_and(sameangle, intersects)
    potentials = np.logical_and(
        potentials, np.identity(len(chks)) == 0
    )  # off-diagonal only
    goodl = np.argwhere(potentials)

    for ii in range(goodl.shape[0]):
        w = chks[goodl[ii, 0]]
        w2 = chks[goodl[ii, 1]]
        trl_spcs, ldg_spcs = trailing_leading(w.txt, w2.txt)

        dx = w.spw * (NUM_SPACES - trl_spcs - ldg_spcs)
        xtol = XTOLEXT * w.spw
        ytol = YTOLEXT * w.mch

        # calculate 2's coords in 1's system
        tr1, br1, tl2, bl2 = w.get_ut_pts(w2)
        xpenmatch = br1[0] - xtol <= bl2[0] <= br1[0] + dx + xtol
        neitherempty = len(wstrip(w.txt)) > 0 and len(wstrip(w2.txt)) > 0

        # Reset per pair so the debug report below never reads a value left
        # over from an earlier pair (or an unbound name on the first pair).
        mtype = None
        mergeable = xpenmatch and neitherempty and not twospaces(w.txt, w2.txt)
        if mergeable:
            weight_match = (
                w.chrs[-1].tsty["font-weight"] == w2.chrs[0].tsty["font-weight"]
            )
            # Don't sub/super merge when differences in font-weight
            # Helps prevent accidental merges of subfigure label to tick
            letterinpar = bool(re.fullmatch(r"^\([a-zA-Z]\)$", w.txt))
            # Don't sub/super merge when is letter enclosed in parentheses
            # Helps prevent accidental merges of subfigure label to tick
            if (
                abs(bl2[1] - br1[1]) < ytol
                and abs(w.tfs - w2.tfs) < 0.001
                and mergenearby
            ):
                if isnumeric(w.line.txt()) and isnumeric(w2.line.txt(), True):
                    numsp = (bl2[0] - br1[0]) / (w.spw)
                    if abs(numsp) < 0.25:
                        # only merge numbers if very close (could be x ticks)
                        mtype = "same"
                else:
                    mtype = "same"
            elif (
                br1[1] + ytol >= bl2[1] >= tr1[1] - ytol
                and mergesupersub
                and weight_match
                and not letterinpar
            ):  # above baseline
                aboveline = (
                    br1[1] * (1 - SUBSUPER_YTHR) + tr1[1] * SUBSUPER_YTHR + ytol
                    >= bl2[1]
                )

                if w2.tfs < w.tfs * SUBSUPER_THR:  # new smaller, expect super
                    if aboveline:
                        mtype = "super"
                elif w.tfs < w2.tfs * SUBSUPER_THR:  # old smaller, expect return
                    mtype = "subreturn"
                elif SUBSUPER_THR == 1:
                    if aboveline:
                        if len(w2.line.txt()) > 2:  # long text, probably not super
                            mtype = "subreturn"
                        else:
                            mtype = "superorsubreturn"
                            # could be either, decide later
                    else:
                        mtype = "subreturn"
            elif (
                br1[1] + ytol >= tl2[1] >= tr1[1] - ytol
                and mergesupersub
                and weight_match
                and not letterinpar
            ):
                belowline = (
                    tl2[1]
                    >= br1[1] * SUBSUPER_YTHR + tr1[1] * (1 - SUBSUPER_YTHR) - ytol
                )
                if w2.tfs < w.tfs * SUBSUPER_THR:  # new smaller, expect sub
                    if belowline:
                        mtype = "sub"
                elif w.tfs < w2.tfs * SUBSUPER_THR:  # old smaller, expect superreturn
                    mtype = "superreturn"
                elif SUBSUPER_THR == 1:
                    if belowline:
                        if len(w2.line.txt()) > 2:  # long text, probably not sub
                            mtype = "superreturn"
                        else:
                            mtype = "suborsuperreturn"
                            # could be either, decide later
                    else:
                        mtype = "superreturn"
            if mtype is not None:
                w.mw.append([w2, mtype, br1, bl2])

        if DEBUG_MERGE:
            dh.idebug('\nMerging "' + w.txt + '" and "' + w2.txt + '"')
            if not (xpenmatch):
                dh.idebug("Aborted, x pen too far: " + str([br1[0], bl2[0], dx]))
            elif not (neitherempty):
                dh.idebug("Aborted, one empty")
            elif not mergeable:
                dh.idebug("Aborted, two spaces")
            elif mtype is None:
                if not (abs(bl2[1] - br1[1]) < ytol):
                    dh.idebug("Aborted, y pen too far: " + str([bl2[1], br1[1]]))
                elif not (abs(w.tfs - w2.tfs) < 0.001):
                    dh.idebug("Aborted, fonts too different: " + str([w.tfs, w2.tfs]))
                elif not (
                    not (isnumeric(w.line.txt())) or not (isnumeric(w2.line.txt()))
                ):
                    dh.idebug("Aborted, both numbers")
            else:
                dh.idebug("Merged as " + mtype)

    Perform_Merges(chks)
    return els


def Perform_Merges(chks, mk=False):
    """Execute the merges queued on each chunk's ``mw`` list.

    Each entry of ``w.mw`` is ``[w2, mtype, br1, bl2]``. The function

    1. keeps, per chunk, the candidate whose start pen (``bl2`` x) is closest
       to the chunk's stop pen (``br1`` x); candidates without a finite
       distance are ignored, so a chunk with only such candidates merges
       nothing;
    2. links the choices into chains stored on the first chunk of each chain
       (``merges`` / ``mergetypes``), marking the others ``merged``;
    3. walks each chain's merge types through a normal/sub/super state
       machine to get ``wtypes``; an impossible transition cancels that
       chain (``merges`` and ``wtypes`` are emptied);
    4. appends the chained chunks with ``append_chk`` and, when several text
       elements were combined, either removes the clip path (if any element
       had none) or unions the clip paths into a duplicate.

    Args:
        chks: Chunks carrying an ``mw`` list; every chunk referenced from a
            ``mw`` entry must itself be in ``chks``.
        mk: True when called for manual kerning removal; suppresses inserted
            spaces for chunks flagged ``prevsametspan`` when the combined
            text already contains a space.
    """
    # Pick the best candidate for each chunk
    for w in chks:
        best = None
        minx = float("inf")
        for ii, candidate in enumerate(w.mw):
            br1 = candidate[2]
            bl2 = candidate[3]
            pen_gap = abs(bl2[0] - br1[0])
            if pen_gap < minx:
                # starting pen best matches the stop of the previous one
                minx = pen_gap
                best = ii
        w.merges = []
        w.mergetypes = []
        w.merged = False
        if best is not None:
            w.merges = [w.mw[best][0]]
            w.mergetypes = [w.mw[best][1]]

    # Generate chains of merges
    for w in chks:
        if not (w.merged) and len(w.merges) > 0:
            w.merges[-1].merged = True
            nextmerge = w.merges[-1].merges
            nextmerget = w.merges[-1].mergetypes
            while len(nextmerge) > 0:
                w.merges += nextmerge
                w.mergetypes += nextmerget
                w.merges[-1].merged = True
                nextmerge = w.merges[-1].merges
                nextmerget = w.merges[-1].mergetypes

    # Create a merge plan: allowed (current type, merge type) -> next type.
    # Anything not listed is an impossible transition (e.g. a chain that
    # maybe started on a sub/superscript) and cancels the chain.
    transitions = {
        "normal": {
            "same": "normal",
            "sub": "sub",
            "super": "super",
            "suborsuperreturn": "sub",
            "superorsubreturn": "super",
        },
        "super": {
            "same": "super",
            "superreturn": "normal",
            "suborsuperreturn": "normal",
        },
        "sub": {
            "same": "sub",
            "subreturn": "normal",
            "superorsubreturn": "normal",
        },
    }
    for w in chks:
        if len(w.merges) > 0:
            ctype = "normal"
            w.wtypes = [ctype]
            bail = False
            for mt in w.mergetypes:
                nexttype = transitions[ctype].get(mt)
                if nexttype is None:
                    bail = True
                    break
                ctype = nexttype
                w.wtypes.append(ctype)
            if bail:
                w.wtypes = []
                w.merges = []

    # Execute the merge plan
    for w in chks:
        if len(w.merges) > 0 and not (w.merged):
            maxii = len(w.merges)
            alltxt = "".join([w.txt] + [w2.txt for w2 in w.merges])
            hasspaces = " " in alltxt

            mels = []
            for ii in range(maxii):
                maxspaces = None
                if mk and hasspaces and w.merges[ii].prevsametspan:
                    maxspaces = 0
                if (
                    w.txt is not None and len(w.txt) > 0 and w.txt[-1] == " "
                ) or w.wtypes[ii + 1] in [
                    "super",
                    "sub",
                ]:  # no extra spaces for sub/supers or if there's already one
                    maxspaces = 0

                mels.append(w.merges[ii].line.ptxt.textel)
                w.append_chk(w.merges[ii], w.wtypes[ii + 1], maxspaces)

            # Union clips if necessary
            mels = dh.unique([w.line.ptxt.textel] + mels)
            if len(mels) > 1:
                clips = [el.get_link("clip-path") for el in mels]
                if any(c is None for c in clips):
                    # at least one merged element is unclipped
                    w.line.ptxt.textel.set("clip-path", None)
                else:
                    # Duplicate main clip
                    dc = clips[0].duplicate()
                    wt = mels[0].ccomposed_transform
                    for ii in range(1, len(mels)):
                        # Duplicate merged clip, group contents, move to main dupe
                        dc2 = clips[ii].duplicate()
                        ng = dh.group(list(dc2))
                        dc.append(ng)
                        ng.ctransform = (-wt) @ mels[ii].ccomposed_transform
                        dc2.delete()
                    mels[0].set("clip-path", dc.get_id(2))


# Check if text represents a number
# ncs lists the characters that may appear in numeric text: digits, decimal
# point, exponent markers, ASCII hyphen, Unicode minus and comma.
# NOTE(review): ncs is not referenced by any code visible in this file.
ncs = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "E", "-", "−", ","]


def isnumeric(s, countminus=False):
    """Return True if ``s`` reads as a number.

    Surrounding whitespace is stripped, Unicode minus signs are replaced
    with ``-`` and commas are removed; the result is numeric when ``float()``
    accepts it.

    Args:
        s: Text to test.
        countminus: When true, a lone minus sign also counts as a number.
    """
    cleaned = s.strip()
    cleaned = cleaned.replace("−", "-")  # Unicode minus -> ASCII hyphen
    cleaned = cleaned.replace(",", "")  # drop thousands separators
    if countminus and cleaned == "-":
        return True
    try:
        float(cleaned)
    except ValueError:
        return False
    return True


# Strip whitespaces
def wstrip(txt):
    """Return ``txt`` with all spaces, newlines, tabs and carriage returns removed."""
    return txt.translate(str.maketrans("", "", " \n\t\r"))


def twospaces(w1txt, w2txt):
    """Return True if joining the two texts would contain a double space.

    That is the case when the first text ends with two spaces, the second
    text starts with two spaces, or the first ends with a space and the
    second starts with one. ``None`` is treated like an empty string.

    Args:
        w1txt: Text of the leading chunk (or None).
        w2txt: Text of the following chunk (or None).
    """
    first = "" if w1txt is None else w1txt
    second = "" if w2txt is None else w2txt
    return (
        first.endswith("  ")
        or (first.endswith(" ") and second.startswith(" "))
        # two leading spaces: needs a two-character prefix comparison
        or second.startswith("  ")
    )


def trailing_leading(wtxt, w2txt):
    """Count the trailing spaces of ``wtxt`` and leading spaces of ``w2txt``.

    Only the plain space character counts. Runs in linear time.

    Returns:
        ``(trl_spcs, ldg_spcs)`` as integers.
    """
    trl_spcs = len(wtxt) - len(wtxt.rstrip(" "))
    ldg_spcs = len(w2txt) - len(w2txt.lstrip(" "))
    return trl_spcs, ldg_spcs