enhance: Convert unincode to ascii to improving expression parsing efficiency (#36675)

issue: #36672

---------

Signed-off-by: Cai Zhang <cai.zhang@zilliz.com>
This commit is contained in:
cai.zhang 2024-10-09 09:23:24 +08:00 committed by GitHub
parent 2ec6e602d6
commit fc8b5ab791
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 111 additions and 2 deletions

View File

@ -131,7 +131,7 @@ func (v *ParserVisitor) VisitFloating(ctx *parser.FloatingContext) interface{} {
// VisitString translates expr to GenericValue.
func (v *ParserVisitor) VisitString(ctx *parser.StringContext) interface{} {
pattern, err := convertEscapeSingle(ctx.StringLiteral().GetText())
pattern, err := convertEscapeSingle(ctx.GetText())
if err != nil {
return err
}

View File

@ -2,7 +2,9 @@ package planparserv2
import (
"fmt"
"strings"
"time"
"unicode"
"github.com/antlr/antlr4/runtime/Go/antlr"
"github.com/hashicorp/golang-lru/v2/expirable"
@ -126,7 +128,39 @@ func CreateRetrievePlan(schema *typeutil.SchemaHelper, exprStr string) (*planpb.
return planNode, nil
}
func convertHanToASCII(s string) string {
var builder strings.Builder
builder.Grow(len(s) * 6)
skipCur := false
n := len(s)
for i, r := range s {
if skipCur {
builder.WriteRune(r)
skipCur = false
continue
}
if r == '\\' {
if i+1 < n && !isEscapeCh(s[i+1]) {
return s
}
skipCur = true
builder.WriteRune(r)
continue
}
if unicode.Is(unicode.Han, r) {
builder.WriteString(formatUnicode(uint32(r)))
} else {
builder.WriteRune(r)
}
}
return builder.String()
}
func CreateSearchPlan(schema *typeutil.SchemaHelper, exprStr string, vectorFieldName string, queryInfo *planpb.QueryInfo) (*planpb.PlanNode, error) {
exprStr = convertHanToASCII(exprStr)
parse := func() (*planpb.Expr, error) {
if len(exprStr) <= 0 {
return nil, nil

View File

@ -2,6 +2,7 @@ package planparserv2
import (
"fmt"
"math/rand"
"sync"
"testing"
@ -1053,13 +1054,14 @@ c'`,
`A == "\中国"`,
}
for _, expr = range invalidExprs {
_, err = CreateSearchPlan(schema, expr, "FloatVectorField", &planpb.QueryInfo{
plan, err := CreateSearchPlan(schema, expr, "FloatVectorField", &planpb.QueryInfo{
Topk: 0,
MetricType: "",
SearchParams: "",
RoundDecimal: 0,
})
assert.Error(t, err)
fmt.Println(plan)
}
}
@ -1338,3 +1340,54 @@ func BenchmarkNoPlanCache(b *testing.B) {
assert.NoError(b, err)
}
}
func randomChineseString(length int) string {
min := 0x4e00
max := 0x9fa5
result := make([]rune, length)
for i := 0; i < length; i++ {
result[i] = rune(rand.Intn(max-min+1) + min)
}
return string(result)
}
func BenchmarkWithString(b *testing.B) {
schema := newTestSchema()
schemaHelper, err := typeutil.CreateSchemaHelper(schema)
require.NoError(b, err)
expr := ""
for i := 0; i < 100; i++ {
expr += fmt.Sprintf(`"%s",`, randomChineseString(rand.Intn(100)))
}
expr = "StringField in [" + expr + "]"
for i := 0; i < b.N; i++ {
plan, err := CreateSearchPlan(schemaHelper, expr, "FloatVectorField", &planpb.QueryInfo{
Topk: 0,
MetricType: "",
SearchParams: "",
RoundDecimal: 0,
})
assert.NoError(b, err)
assert.NotNil(b, plan)
}
}
func Test_convertHanToASCII(t *testing.T) {
type testcase struct {
source string
target string
}
testcases := []testcase{
{`A in ["中国"]`, `A in ["\u4e2d\u56fd"]`},
{`A in ["\中国"]`, `A in ["\中国"]`},
{`A in ["\\中国"]`, `A in ["\\\u4e2d\u56fd"]`},
}
for _, c := range testcases {
assert.Equal(t, c.target, convertHanToASCII(c.source))
}
}

View File

@ -539,3 +539,25 @@ func isIntegerColumn(col *planpb.ColumnInfo) bool {
(typeutil.IsArrayType(col.GetDataType()) && typeutil.IsIntegerType(col.GetElementType())) ||
typeutil.IsJSONType(col.GetDataType())
}
func isEscapeCh(ch uint8) bool {
return ch == '\\' || ch == 'n' || ch == 't' || ch == 'r' || ch == 'f' || ch == '"' || ch == '\''
}
func formatUnicode(r uint32) string {
return string([]byte{
'\\', 'u',
hexDigit(r >> 12),
hexDigit(r >> 8),
hexDigit(r >> 4),
hexDigit(r),
})
}
func hexDigit(n uint32) byte {
n &= 0xf
if n < 10 {
return byte(n) + '0'
}
return byte(n-10) + 'a'
}