// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package importutil

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"reflect"
	"regexp"
	"strconv"
	"unicode/utf8"

	"github.com/milvus-io/milvus-proto/go-api/schemapb"
	"github.com/milvus-io/milvus/internal/log"
	"github.com/sbinet/npyio"
	"github.com/sbinet/npyio/npy"
	"go.uber.org/zap"
)

var (
	reStrPre  = regexp.MustCompile(`^[|]*?(\d.*)[Sa]$`)
	reStrPost = regexp.MustCompile(`^[|]*?[Sa](\d.*)$`)
	reUniPre  = regexp.MustCompile(`^[<|>]*?(\d.*)U$`)
	reUniPost = regexp.MustCompile(`^[<|>]*?U(\d.*)$`)
)

func CreateNumpyFile(path string, data interface{}) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()

	err = npyio.Write(f, data)
	if err != nil {
		return err
	}

	return nil
}

func CreateNumpyData(data interface{}) ([]byte, error) {
	buf := new(bytes.Buffer)
	err := npyio.Write(buf, data)
	if err != nil {
		return nil, err
	}

	return buf.Bytes(), nil
}

// NumpyAdapter is the class to expand the abilities of the numpy lib.
// We evaluated two go-numpy libs: github.com/kshedden/gonpy and github.com/sbinet/npyio.
// The npyio lib reads data one element at a time, which performs poorly, so we extend it with
// methods that read data in one batch; batch reading is about 100X faster.
// The gonpy lib also reads data in one batch, but it has no method to read bool data, and its ability
// to handle different data types is not as strong as npyio's, so we choose the npyio lib to extend.
type NumpyAdapter struct {
	reader       io.Reader         // data source, typically an os.File
	npyReader    *npy.Reader       // reader of the npyio lib
	order        binary.ByteOrder  // LittleEndian or BigEndian
	readPosition int               // how many elements have been read
	dataType     schemapb.DataType // data type parsed from the numpy file header
}

func NewNumpyAdapter(reader io.Reader) (*NumpyAdapter, error) {
	r, err := npyio.NewReader(reader)
	if err != nil {
		return nil, err
	}

	dataType, err := convertNumpyType(r.Header.Descr.Type)
	if err != nil {
		return nil, err
	}

	adapter := &NumpyAdapter{
		reader:       reader,
		npyReader:    r,
		readPosition: 0,
		dataType:     dataType,
	}
	adapter.setByteOrder()

	log.Info("Numpy adapter: numpy header info",
		zap.Any("shape", r.Header.Descr.Shape),
		zap.String("dType", r.Header.Descr.Type),
		zap.Uint8("majorVer", r.Header.Major),
		zap.Uint8("minorVer", r.Header.Minor),
		zap.String("ByteOrder", adapter.order.String()))

	return adapter, err
}

// convertNumpyType converts the data type parsed from the numpy header description into a Milvus data type,
// for vector fields the element type is int8 (binary vector) or float32 (float vector)
func convertNumpyType(typeStr string) (schemapb.DataType, error) {
	log.Info("Numpy adapter: parse numpy file dtype", zap.String("dtype", typeStr))

	switch typeStr {
	case "b1", "<b1", "|b1", "bool":
		return schemapb.DataType_Bool, nil
	case "i1", "<i1", "|i1", ">i1", "int8":
		return schemapb.DataType_Int8, nil
	case "i2", "<i2", "|i2", ">i2", "int16":
		return schemapb.DataType_Int16, nil
	case "i4", "<i4", "|i4", ">i4", "int32":
		return schemapb.DataType_Int32, nil
	case "i8", "<i8", "|i8", ">i8", "int64":
		return schemapb.DataType_Int64, nil
	case "f4", "<f4", "|f4", ">f4", "float32":
		return schemapb.DataType_Float, nil
	case "f8", "<f8", "|f8", ">f8", "float64":
		return schemapb.DataType_Double, nil
	default:
		if isStringType(typeStr) {
			return schemapb.DataType_VarChar, nil
		}
		log.Error("Numpy adapter: the numpy file data type is not supported", zap.String("dataType", typeStr))
		return schemapb.DataType_None, fmt.Errorf("Numpy adapter: the numpy file dtype '%s' is not supported", typeStr)
	}
}

func stringLen(dtype string) (int, bool, error) {
	var utf bool
	switch {
	case reStrPre.MatchString(dtype), reStrPost.MatchString(dtype):
		utf = false
	case reUniPre.MatchString(dtype), reUniPost.MatchString(dtype):
		utf = true
	}

	if m := reStrPre.FindStringSubmatch(dtype); m != nil {
		v, err := strconv.Atoi(m[1])
		if err != nil {
			return 0, false, err
		}
		return v, utf, nil
	}
	if m := reStrPost.FindStringSubmatch(dtype); m != nil {
		v, err := strconv.Atoi(m[1])
		if err != nil {
			return 0, false, err
		}
		return v, utf, nil
	}
	if m := reUniPre.FindStringSubmatch(dtype); m != nil {
		v, err := strconv.Atoi(m[1])
		if err != nil {
			return 0, false, err
		}
		return v, utf, nil
	}
	if m := reUniPost.FindStringSubmatch(dtype); m != nil {
		v, err := strconv.Atoi(m[1])
		if err != nil {
			return 0, false, err
		}
		return v, utf, nil
	}

	return 0, false, fmt.Errorf("Numpy adapter: data type '%s' of numpy file is not varchar data type", dtype)
}

func isStringType(typeStr string) bool {
	rt := npyio.TypeFrom(typeStr)
	return rt == reflect.TypeOf((*string)(nil)).Elem()
}

// setByteOrder sets BigEndian/LittleEndian, the logic of this method is copied from the npyio lib
func (n *NumpyAdapter) setByteOrder() {
	var nativeEndian binary.ByteOrder
	v := uint16(1)
	switch byte(v >> 8) {
	case 0:
		nativeEndian = binary.LittleEndian
	case 1:
		nativeEndian = binary.BigEndian
	}

	switch n.npyReader.Header.Descr.Type[0] {
	case '<':
		n.order = binary.LittleEndian
	case '>':
		n.order = binary.BigEndian
	default:
		n.order = nativeEndian
	}
}

func (n *NumpyAdapter) Reader() io.Reader {
	return n.reader
}

func (n *NumpyAdapter) NpyReader() *npy.Reader {
	return n.npyReader
}
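// exampleDtypeMapping is an illustrative sketch, not part of the original adapter API: it shows how
// typical numpy dtype strings from a file header are interpreted by convertNumpyType and stringLen.
// The function name and the sample dtype strings "<i8" and "<U8" are assumptions chosen for demonstration.
func exampleDtypeMapping() {
	// a little-endian 8-byte integer dtype maps to the Milvus Int64 type
	if dt, err := convertNumpyType("<i8"); err == nil {
		log.Info("Numpy adapter example: numeric dtype mapping", zap.String("dtype", "<i8"), zap.String("milvusType", dt.String()))
	}

	// "<U8" is a unicode string dtype with max length 8, it is treated as VarChar;
	// stringLen extracts the max length and reports whether the field is unicode
	if maxLen, utf, err := stringLen("<U8"); err == nil {
		log.Info("Numpy adapter example: varchar dtype", zap.Int("maxLen", maxLen), zap.Bool("utf", utf))
	}
}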
func (n *NumpyAdapter) GetType() schemapb.DataType {
	return n.dataType
}

func (n *NumpyAdapter) GetShape() []int {
	return n.npyReader.Header.Descr.Shape
}

func (n *NumpyAdapter) checkCount(count int) int {
	shape := n.GetShape()

	// empty file?
	if len(shape) == 0 {
		return 0
	}

	total := 1
	for i := 0; i < len(shape); i++ {
		total *= shape[i]
	}

	if total == 0 {
		return 0
	}

	// overflow?
	if count > (total - n.readPosition) {
		return total - n.readPosition
	}

	return count
}

func (n *NumpyAdapter) ReadBool(count int) ([]bool, error) {
	if count <= 0 {
		return nil, errors.New("Numpy adapter: cannot read bool data with a zero or negative count")
	}

	// incorrect type
	if n.dataType != schemapb.DataType_Bool {
		return nil, errors.New("Numpy adapter: numpy data is not bool type")
	}

	// avoid read overflow
	readSize := n.checkCount(count)
	if readSize <= 0 {
		return nil, errors.New("Numpy adapter: end of bool file, nothing to read")
	}

	// read data
	data := make([]bool, readSize)
	err := binary.Read(n.reader, n.order, &data)
	if err != nil {
		return nil, fmt.Errorf("Numpy adapter: failed to read bool data with count %d, error: %w", readSize, err)
	}

	// update read position after a successful read
	n.readPosition += readSize

	return data, nil
}

func (n *NumpyAdapter) ReadUint8(count int) ([]uint8, error) {
	if count <= 0 {
		return nil, errors.New("Numpy adapter: cannot read uint8 data with a zero or negative count")
	}

	// incorrect type
	// here we don't use n.dataType to check because currently milvus has no uint8 type
	switch n.npyReader.Header.Descr.Type {
	case "u1", "<u1", "|u1", "uint8":
	default:
		return nil, errors.New("Numpy adapter: numpy data is not uint8 type")
	}

	// avoid read overflow
	readSize := n.checkCount(count)
	if readSize <= 0 {
		return nil, errors.New("Numpy adapter: end of uint8 file, nothing to read")
	}

	// read data
	data := make([]uint8, readSize)
	err := binary.Read(n.reader, n.order, &data)
	if err != nil {
		return nil, fmt.Errorf("Numpy adapter: failed to read uint8 data with count %d, error: %w", readSize, err)
	}

	// update read position after a successful read
	n.readPosition += readSize

	return data, nil
}

func (n *NumpyAdapter) ReadString(count int) ([]string, error) {
	if count <= 0 {
		return nil, errors.New("Numpy adapter: cannot read varchar data with a zero or negative count")
	}

	// incorrect type
	if n.dataType != schemapb.DataType_VarChar {
		return nil, errors.New("Numpy adapter: numpy data is not varchar type")
	}

	// varchar length, in a numpy file all strings of a field occupy the same number of bytes,
	// the max length is parsed from the dtype string, e.g. "|S10" or "<U10"
	maxLen, utf, err := stringLen(n.npyReader.Header.Descr.Type)
	if err != nil || maxLen <= 0 {
		log.Error("Numpy adapter: failed to get max length of varchar from numpy file header", zap.Int("maxLen", maxLen), zap.Any("err", err))
		return nil, fmt.Errorf("Numpy adapter: failed to get max length %d of varchar from numpy file header, error: %w", maxLen, err)
	}

	// avoid read overflow
	readSize := n.checkCount(count)
	if readSize <= 0 {
		return nil, errors.New("Numpy adapter: end of varchar file, nothing to read")
	}

	// read data
	data := make([]string, 0)
	for i := 0; i < readSize; i++ {
		if utf {
			// in the numpy file, each unicode character occupies utf8.UTFMax bytes,
			// so each string element occupies utf8.UTFMax*maxLen bytes
			raw, err := ioutil.ReadAll(io.LimitReader(n.reader, utf8.UTFMax*int64(maxLen)))
			if err != nil {
				log.Error("Numpy adapter: failed to read utf8 string from numpy file", zap.Int("i", i), zap.Any("err", err))
				return nil, fmt.Errorf("Numpy adapter: failed to read utf8 string from numpy file, error: %w", err)
			}

			var str string
			for len(raw) > 0 {
				r, _ := utf8.DecodeRune(raw)
				if r == utf8.RuneError {
					log.Error("Numpy adapter: failed to decode utf8 string from numpy file", zap.Any("raw", raw[:utf8.UTFMax]))
					return nil, fmt.Errorf("Numpy adapter: failed to decode utf8 string from numpy file, error: illegal utf-8 encoding")
				}

				// only support ascii characters, because the numpy lib encodes the utf8 bytes by its internal method,
				// the encode/decode logic is not clear now, return error
				n := n.order.Uint32(raw)
				if n > 127 {
					log.Error("Numpy adapter: a string contains non-ascii characters, not support yet", zap.Int32("utf8Code", r))
					return nil, fmt.Errorf("Numpy adapter: a string contains non-ascii characters, not support yet")
				}

				// if a string is shorter than maxLen, the tail characters will be filled with "\u0000" (in utf spec this is Null)
				if r > 0 {
					str += string(r)
				}

				raw = raw[utf8.UTFMax:]
			}

			data = append(data, str)
		} else {
			buf, err := ioutil.ReadAll(io.LimitReader(n.reader, int64(maxLen)))
			if err != nil {
				log.Error("Numpy adapter: failed to read string from numpy file", zap.Int("i", i), zap.Any("err", err))
				return nil, fmt.Errorf("Numpy adapter: failed to read string from numpy file, error: %w", err)
			}

			n := bytes.Index(buf, []byte{0})
			if n > 0 {
				buf = buf[:n]
			}
			data = append(data, string(buf))
		}
	}

	// update read position after a successful read
	n.readPosition += readSize

	return data, nil
}
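// The sketch below is illustrative and not part of the original file: it shows the intended
// write-then-read flow of this package, using CreateNumpyFile to produce a numpy file and
// NumpyAdapter to read it back batch by batch. The function name exampleRoundTripBool, the
// file path argument and the batch size are assumptions chosen for demonstration.
func exampleRoundTripBool(path string, batchSize int) ([]bool, error) {
	// write a small bool column so the example is self-contained
	if err := CreateNumpyFile(path, []bool{true, false, true, false, true}); err != nil {
		return nil, err
	}

	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	adapter, err := NewNumpyAdapter(file)
	if err != nil {
		return nil, err
	}

	// the parsed header already tells the element type and shape before any element is read
	if adapter.GetType() != schemapb.DataType_Bool {
		return nil, errors.New("example: expected a bool numpy file")
	}
	log.Info("Numpy adapter example: file shape", zap.Any("shape", adapter.GetShape()))

	// read batch by batch; ReadBool returns an error once all elements have been consumed,
	// which terminates the loop in this sketch
	result := make([]bool, 0)
	for {
		batch, err := adapter.ReadBool(batchSize)
		if err != nil {
			break
		}
		result = append(result, batch...)
	}

	return result, nil
}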