package scanner

import (
	"bufio"
	"errors"
	"io"
	"unicode/utf8"

	"gitea.zaclys.com/bvaudour/gob/collection"
)

// Tokenizer is an interface providing a split function.
// Split has the same signature as bufio.SplitFunc.
type Tokenizer interface {
	Split(data []byte, atEOF bool) (advance int, token []byte, err error)
}

// tk is the default implementation of Tokenizer.
type tk struct {
	quote  bool   // recognize single and double quotes
	escape bool   // recognize backslash escapes
	spaces []rune // runes treated as token separators
	raw    bool   // keep quotes and escape characters in the tokens
}

// runeToBytes returns the UTF-8 encoding of r.
func runeToBytes(r rune) []byte {
	l := utf8.RuneLen(r)
	b := make([]byte, l)
	utf8.EncodeRune(b, r)
	return b
}

// spl scans data from start and extracts the next token, interpreting and
// stripping quotes and escape characters. It returns the number of bytes
// consumed, the token, the pending escape and quote runes, and whether a
// separator was reached.
func (t tk) spl(q, e, s collection.Set[rune], start int, data []byte) (advance int, token []byte, esc, quote rune, done bool) {
	// Scan until space, marking end of word.
	for width, i := 0, start; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		if s.Contains(r) && !q.Contains(quote) && !e.Contains(esc) {
			return i + width, token, esc, quote, true
		}
		if e.Contains(esc) {
			// Inside quotes, keep the escape character itself unless it
			// escapes another escape character or the closing quote.
			if q.Contains(quote) && !e.Contains(r) && r != quote {
				token = append(token, runeToBytes(esc)...)
			}
			token = append(token, runeToBytes(r)...)
			esc = 0
		} else if e.Contains(r) {
			esc = r
		} else if q.Contains(r) {
			if !q.Contains(quote) {
				quote = r
			} else if quote == r {
				quote = 0
			} else {
				token = append(token, runeToBytes(r)...)
			}
		} else {
			token = append(token, runeToBytes(r)...)
		}
	}
	return
}
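
// Illustration (a sketch, assuming quotes {'\'', '"'}, escape {'\\'} and
// whitespace separators): for the input `"a\"b" c`, spl consumes the bytes
// up to and including the first space and returns the token `a"b`: the
// surrounding quotes are stripped and the escaped inner quote is kept.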

// rawSpl scans data from start and extracts the next token, like spl, but
// keeps quotes and escape characters in the token: they are only tracked
// to decide where the token ends.
func (t tk) rawSpl(q, e, s collection.Set[rune], start int, data []byte) (advance int, token []byte, esc, quote rune, done bool) {
	// Scan until space, marking end of word.
	for width, i := 0, start; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		if s.Contains(r) && !q.Contains(quote) && !e.Contains(esc) {
			return i + width, token, esc, quote, true
		}
		token = append(token, runeToBytes(r)...)
		if e.Contains(esc) {
			esc = 0
		} else if e.Contains(r) {
			esc = r
		} else if q.Contains(quote) {
			if quote == r {
				quote = 0
			}
		} else if q.Contains(r) {
			quote = r
		}
	}
	return
}
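
// Illustration (same assumed sets as above): for the input `"a\"b" c`,
// rawSpl returns the token `"a\"b"` verbatim.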

// Split implements Tokenizer. It follows the bufio.SplitFunc contract:
// it returns the number of bytes to advance the input, the next token if
// any, and a non-nil error to abort scanning.
func (t tk) Split(data []byte, atEOF bool) (advance int, token []byte, err error) {
	q, e := collection.NewSet[rune](), collection.NewSet[rune]()
	s := collection.NewSet(t.spaces...)
	if t.quote {
		q.Add('\'', '"')
	}
	if t.escape {
		e.Add('\\')
	}

	// Skip leading spaces.
	start := 0
	for width := 0; start < len(data); start += width {
		var r rune
		r, width = utf8.DecodeRune(data[start:])
		if !s.Contains(r) {
			break
		}
	}

	// Scan until space, marking end of word.
	var ok bool
	var esc, quote rune
	if t.raw {
		advance, token, esc, quote, ok = t.rawSpl(q, e, s, start, data)
	} else {
		advance, token, esc, quote, ok = t.spl(q, e, s, start, data)
	}

	if ok {
		return
	}

	// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.
	if atEOF && len(data) > start {
		if e.Contains(esc) || q.Contains(quote) {
			return start, nil, errors.New("incomplete token")
		}
		return len(data), token, nil
	}
	// Request more data.
	return start, nil, nil
}
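
// For example (a sketch, for a tokenizer with quote enabled): given
// data = []byte(`"abc`) and atEOF = true, Split returns an error because
// the quote is never closed; with atEOF = false it returns (0, nil, nil)
// to request more data, as the bufio.SplitFunc contract allows.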

// NewTokenizer returns a tokenizer for command-line arguments.
//
// The input is split on the given space characters (all standard
// whitespace characters if none is given), unless those characters are
// escaped (if escape is true) or quoted (if quote is true).
// For example, the following string:
//
//	unmot une\ phrase "une deuxième\" phrase"
//
// is split into 3 elements:
//   - unmot
//   - une phrase
//   - une deuxième" phrase
func NewTokenizer(quote, escape bool, spaces ...rune) Tokenizer {
	if len(spaces) == 0 {
		spaces = []rune(" \t\n\v\f\r")
	}
	return tk{
		quote:  quote,
		escape: escape,
		spaces: spaces,
	}
}

// NewRawTokenizer returns a tokenizer for command-line arguments.
//
// The input is split on the given space characters (all standard
// whitespace characters if none is given), unless those characters are
// escaped (if escape is true) or quoted (if quote is true). Unlike
// NewTokenizer, quotes and escape characters are kept in the tokens.
// For example, the following string:
//
//	unmot une\ phrase "une deuxième\" phrase"
//
// is split into 3 elements:
//   - unmot
//   - une\ phrase
//   - "une deuxième\" phrase"
func NewRawTokenizer(quote, escape bool, spaces ...rune) Tokenizer {
	if len(spaces) == 0 {
		spaces = []rune(" \t\n\v\f\r")
	}
	return tk{
		quote:  quote,
		escape: escape,
		spaces: spaces,
		raw:    true,
	}
}

// NewScanner returns a new scanner using the given tokenizer as its split
// function (NewTokenizer(true, true) if no tokenizer is given).
func NewScanner(r io.Reader, t ...Tokenizer) *bufio.Scanner {
	var s bufio.SplitFunc
	if len(t) > 0 {
		s = t[0].Split
	} else {
		s = NewTokenizer(true, true).Split
	}
	sc := bufio.NewScanner(r)
	sc.Split(s)
	return sc
}
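
// Usage sketch (illustrative only; assumes the fmt and strings packages):
//
//	sc := NewScanner(strings.NewReader(`unmot une\ phrase "une deuxième\" phrase"`))
//	for sc.Scan() {
//		fmt.Println(sc.Text())
//	}
//
// This prints one token per line, with quotes interpreted and stripped
// and the escaped inner quote kept.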

// NewRawScanner returns a scanner using a raw tokenizer with quote and
// escape detection enabled.
func NewRawScanner(r io.Reader) *bufio.Scanner {
	return NewScanner(r, NewRawTokenizer(true, true))
}
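
// With NewRawScanner on the same input, the loop above would print the
// three tokens verbatim, quotes and escape characters included.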