Module shell (1)
parent 57f6071552
commit 898822d78f
3 changed files with 646 additions and 0 deletions
125 shell/scanner/scanner.go Normal file
@@ -0,0 +1,125 @@
package scanner

import (
	"bufio"
	"errors"
	"io"
	"unicode/utf8"

	"gitea.zaclys.com/bvaudour/gob/collection"
)

// Tokenizer is an interface implementing a split function.
type Tokenizer interface {
	Split(data []byte, atEOF bool) (advance int, token []byte, err error)
}

// tk is the default Tokenizer implementation.
type tk struct {
	quote  bool   // honor single and double quotes
	escape bool   // honor backslash escapes
	spaces []rune // runes treated as token separators
}

// runeToBytes returns the UTF-8 encoding of r.
func runeToBytes(r rune) []byte {
	l := utf8.RuneLen(r)
	b := make([]byte, l)
	utf8.EncodeRune(b, r)
	return b
}

func (t tk) Split(data []byte, atEOF bool) (advance int, token []byte, err error) {
	q, e := collection.NewSet[rune](), collection.NewSet[rune]()
	s := collection.NewSet(t.spaces...)
	if t.quote {
		q.Add('\'', '"')
	}
	if t.escape {
		e.Add('\\')
	}

	// Skip leading spaces.
	start := 0
	for width := 0; start < len(data); start += width {
		var r rune
		r, width = utf8.DecodeRune(data[start:])
		if !s.Contains(r) {
			break
		}
	}

	var quote, esc rune

	// Scan until space, marking end of word.
	for width, i := 0, start; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		if s.Contains(r) && !q.Contains(quote) && !e.Contains(esc) {
			return i + width, token, nil
		}
		if e.Contains(esc) {
			// The previous rune was an escape: inside quotes, keep the
			// escape character unless it escapes the quote or another
			// escape character.
			if q.Contains(quote) && !e.Contains(r) && r != quote {
				token = append(token, runeToBytes(esc)...)
			}
			token = append(token, runeToBytes(r)...)
			esc = 0
		} else if e.Contains(r) {
			esc = r
		} else if q.Contains(r) {
			if !q.Contains(quote) {
				quote = r
			} else if quote == r {
				quote = 0
			} else {
				token = append(token, runeToBytes(r)...)
			}
		} else {
			token = append(token, runeToBytes(r)...)
		}
	}
	// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.
	if atEOF && len(data) > start {
		if e.Contains(esc) || q.Contains(quote) {
			return start, nil, errors.New("incomplete token")
		}
		return len(data), token, nil
	}
	// Request more data.
	return start, nil, nil
}

// NewTokenizer returns a tokenizer for command-line arguments.
//
// Splitting occurs on the given space characters (all whitespace characters
// if none is given), except where those characters are escaped (if escape
// is true) or quoted (if quote is true).
// For example, the following string:
//
//	unmot une\ phrase "une deuxième\" phrase"
//
// is split into 3 elements:
//   - unmot
//   - une\ phrase
//   - "une deuxième\" phrase"
//
// which yield the tokens unmot, une phrase and une deuxième" phrase once
// escapes and quotes are removed.
func NewTokenizer(quote, escape bool, spaces ...rune) Tokenizer {
	if len(spaces) == 0 {
		spaces = []rune(" \t\n\v\f\r")
	}
	return tk{
		quote:  quote,
		escape: escape,
		spaces: spaces,
	}
}

// NewScanner returns a new scanner using the given tokenizer as its split
// function (NewTokenizer(true, true) if no tokenizer is given).
func NewScanner(r io.Reader, t ...Tokenizer) *bufio.Scanner {
	var s bufio.SplitFunc
	if len(t) > 0 {
		s = t[0].Split
	} else {
		s = (NewTokenizer(true, true)).Split
	}
	sc := bufio.NewScanner(r)
	sc.Split(s)
	return sc
}
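For illustration, a minimal usage sketch (not part of this commit), assuming the package is importable as gitea.zaclys.com/bvaudour/gob/shell/scanner, mirroring the collection import above. It scans the example string from the NewTokenizer documentation:

package main

import (
	"fmt"
	"strings"

	"gitea.zaclys.com/bvaudour/gob/shell/scanner"
)

func main() {
	// Scan with the default tokenizer (quoting and escaping enabled).
	in := strings.NewReader(`unmot une\ phrase "une deuxième\" phrase"`)
	sc := scanner.NewScanner(in)
	for sc.Scan() {
		fmt.Printf("%q\n", sc.Text())
	}
	// An unterminated quote or a trailing escape would surface here as
	// an "incomplete token" error.
	if err := sc.Err(); err != nil {
		fmt.Println("error:", err)
	}
	// Expected output:
	// "unmot"
	// "une phrase"
	// "une deuxième\" phrase"
}

Because Split is an ordinary bufio.SplitFunc, the tokenizer composes with any io.Reader through the standard bufio.Scanner machinery.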