Module atom: correction + ajout de méthodes pour Buffer et State; Module scanner: ajout d’une fonction pour scanner les champs de manière brute

2024-03-02 14:26:17 +01:00 · 2024-03-02 14:26:17 +01:00 · 3307dec97c
commit 3307dec97c
parent 974a160bfb
4 changed files with 208 additions and 31 deletions
--- a/shell/scanner/scanner.go
+++ b/shell/scanner/scanner.go
@ -18,6 +18,7 @@ type tk struct {
 	quote  bool
 	escape bool
 	spaces []rune
+	raw    bool
 }

 func runeToBytes(r rune) []byte {
@ -27,6 +28,64 @@ func runeToBytes(r rune) []byte {
 	return b
 }

+// spl scans data from byte offset start and builds a single token,
+// interpreting quote and escape runes instead of copying them verbatim.
+//
+// q, e and s are the sets of quote, escape and separator runes. The receiver
+// is not used.
+// NOTE(review): the checks below assume the zero rune is never a member of q
+// or e, so that quote == 0 and esc == 0 mean "no open quote" and "no pending
+// escape" — confirm against how Split fills these sets.
+//
+// When a separator is met outside quotes and with no pending escape, spl
+// returns the offset just past that separator, the token built so far, the
+// current esc/quote state and done == true. If data runs out first, the named
+// results are returned as they stand: advance is 0, done is false, and
+// esc/quote tell the caller whether an escape or a quote was left open.
+func (t tk) spl(q, e, s collection.Set[rune], start int, data []byte) (advance int, token []byte, esc, quote rune, done bool) {
+	// Walk data rune by rune until an unquoted, unescaped separator ends the word.
+	for width, i := 0, start; i < len(data); i += width {
+		var r rune
+		r, width = utf8.DecodeRune(data[i:])
+		// A separator only terminates the token outside quotes and when it is not escaped.
+		if s.Contains(r) && !q.Contains(quote) && !e.Contains(esc) {
+			return i + width, token, esc, quote, true
+		}

+		if e.Contains(esc) {
+			// The previous rune was an escape, so r is taken literally. Inside
+			// quotes the escape rune itself is kept in the output unless it
+			// escapes another escape rune or the currently open quote.
+			if q.Contains(quote) && !e.Contains(r) && r != quote {
+				token = append(token, runeToBytes(esc)...)
+			}
+			token = append(token, runeToBytes(r)...)
+			esc = 0
+		} else if e.Contains(r) {
+			// Escape rune: remember it and emit nothing yet.
+			esc = r
+		} else if q.Contains(r) {
+			// Quote rune: it opens a quote, closes the matching one, or is
+			// copied literally when a different quote is already open.
+			if !q.Contains(quote) {
+				quote = r
+			} else if quote == r {
+				quote = 0
+			} else {
+				token = append(token, runeToBytes(r)...)
+			}
+		} else {
+			token = append(token, runeToBytes(r)...)
+		}
+	}

+	return
+}
+
+// rawSpl scans data from byte offset start and builds a single token in raw
+// form: every rune consumed before the terminating separator is copied into
+// the token, quote and escape runes included.
+//
+// q, e and s are the sets of quote, escape and separator runes. The receiver
+// is not used. Quote and escape state is tracked only to decide whether a
+// separator ends the token.
+// NOTE(review): the checks below assume the zero rune is never a member of q
+// or e, so that quote == 0 and esc == 0 mean "no open quote" and "no pending
+// escape" — confirm against how Split fills these sets.
+//
+// When a separator is met outside quotes and with no pending escape, rawSpl
+// returns the offset just past that separator, the token built so far, the
+// current esc/quote state and done == true. If data runs out first, the named
+// results are returned as they stand: advance is 0, done is false, and
+// esc/quote tell the caller whether an escape or a quote was left open.
+func (t tk) rawSpl(q, e, s collection.Set[rune], start int, data []byte) (advance int, token []byte, esc, quote rune, done bool) {
+	// Walk data rune by rune until an unquoted, unescaped separator ends the word.
+	for width, i := 0, start; i < len(data); i += width {
+		var r rune
+		r, width = utf8.DecodeRune(data[i:])
+		// A separator only terminates the token outside quotes and when it is not escaped.
+		if s.Contains(r) && !q.Contains(quote) && !e.Contains(esc) {
+			return i + width, token, esc, quote, true
+		}
+		// Raw mode: the rune is always kept, whatever its role.
+		token = append(token, runeToBytes(r)...)
+		if e.Contains(esc) {
+			// r was escaped: it changes no state and the escape is consumed.
+			esc = 0
+		} else if e.Contains(r) {
+			esc = r
+		} else if q.Contains(quote) {
+			// Inside a quote, only the matching quote rune closes it.
+			if quote == r {
+				quote = 0
+			}
+		} else if q.Contains(r) {
+			quote = r
+		}
+	}

+	return
+}
+
 func (t tk) Split(data []byte, atEOF bool) (advance int, token []byte, err error) {
 	q, e := collection.NewSet[rune](), collection.NewSet[rune]()
 	s := collection.NewSet(t.spaces...)
@ -47,35 +106,19 @@ func (t tk) Split(data []byte, atEOF bool) (advance int, token []byte, err error
 		}
 	}

-	var quote, esc rune
-
 	// Scan until space, marking end of word.
-	for width, i := 0, start; i < len(data); i += width {
-		var r rune
-		r, width = utf8.DecodeRune(data[i:])
-		if s.Contains(r) && !q.Contains(quote) && !e.Contains(esc) {
-			return i + width, token, nil
-		}
-		if e.Contains(esc) {
-			if q.Contains(quote) && !e.Contains(r) && r != quote {
-				token = append(token, runeToBytes(esc)...)
-			}
-			token = append(token, runeToBytes(r)...)
-			esc = 0
-		} else if e.Contains(r) {
-			esc = r
-		} else if q.Contains(r) {
-			if !q.Contains(quote) {
-				quote = r
-			} else if quote == r {
-				quote = 0
-			} else {
-				token = append(token, runeToBytes(r)...)
-			}
-		} else {
-			token = append(token, runeToBytes(r)...)
-		}
+	var ok bool
+	var esc, quote rune
+	if t.raw {
+		advance, token, esc, quote, ok = t.rawSpl(q, e, s, start, data)
+	} else {
+		advance, token, esc, quote, ok = t.spl(q, e, s, start, data)
 	}
+
+	if ok {
+		return
+	}
+
 	// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.
 	if atEOF && len(data) > start {
 		if e.Contains(esc) || q.Contains(quote) {
@ -97,8 +140,8 @@ func (t tk) Split(data []byte, atEOF bool) (advance int, token []byte, err error
 //
 // Le résultat va être décomposé en 3 éléments :
 // - unmot
-// - une\ phrase
-// - "une deuxième\" phrase"
+// - une phrase
+// - une deuxième phrase
 func NewTokenizer(quote, escape bool, spaces ...rune) Tokenizer {
 	if len(spaces) == 0 {
 		spaces = []rune(" \t\n\v\f\r")
@ -110,6 +153,30 @@ func NewTokenizer(quote, escape bool, spaces ...rune) Tokenizer {
 	}
 }

+// NewRawTokenizer returns a command-line argument tokenizer that yields its
+// tokens in raw form (the raw field is set to true).
+//
+// Splitting happens on the given spaces runes (" \t\n\v\f\r" when none are
+// given), except where those runes are escaped (if escape) or enclosed in
+// quotes (if quote).
+// For example, take the following string:
+// unmot une\ phrase "une deuxième\" phrase"
+//
+// The result is broken down into 3 elements, with quote and escape runes kept:
+// - unmot
+// - une\ phrase
+// - "une deuxième\" phrase"
+func NewRawTokenizer(quote, escape bool, spaces ...rune) Tokenizer {
+	if len(spaces) == 0 {
+		spaces = []rune(" \t\n\v\f\r")
+	}
+	return tk{
+		quote:  quote,
+		escape: escape,
+		spaces: spaces,
+		raw:    true,
+	}
+}
+
 // NewScanner retourne un nouveau scanner utilisant le tokenizer spécifié
 // pour la fonction de splitage (NewTokenizer(true, true) si aucun tokenizer fourni).
 func NewScanner(r io.Reader, t ...Tokenizer) *bufio.Scanner {
@ -123,3 +190,8 @@ func NewScanner(r io.Reader, t ...Tokenizer) *bufio.Scanner {
 	sc.Split(s)
 	return sc
 }
+
+// NewRawScanner returns a scanner reading from r whose split function comes
+// from NewRawTokenizer with both quote and escape detection enabled.
+func NewRawScanner(r io.Reader) *bufio.Scanner {
+	return NewScanner(r, NewRawTokenizer(true, true))
+}