Update cases of construct from dataframe (#5980)

* update cases for primary_field and auto_id Signed-off-by: ThreadDao <yufen.zong@zilliz.com> * update cases of construct from dataframe Signed-off-by: ThreadDao <yufen.zong@zilliz.com>
2025-12-28 22:45:26 +08:00 · 2021-06-22 17:12:06 +08:00 · 2021-06-22 17:12:06 +08:00 · 20a3d601ea
commit 20a3d601ea
parent b31bae351d
2 changed files with 209 additions and 66 deletions
--- a/tests20/python_client/check/func_check.py
+++ b/tests20/python_client/check/func_check.py
@ -107,18 +107,16 @@ class ResponseChecker:
            raise Exception("The result to check isn't collection type object")
        if len(check_items) == 0:
            raise Exception("No expect values found in the check task")
-        name = check_items.get("name", None)
-        schema = check_items.get("schema", None)
-        num_entities = check_items.get("num_entities", 0)
-        primary = check_items.get("primary", ct.default_int64_field_name)
-        if name:
-            assert collection.name == name
-        if schema:
-            assert collection.schema == schema
-        if num_entities == 0:
-            assert collection.is_empty
-        assert collection.num_entities == num_entities
-        assert collection.primary_field.name == primary
+        if check_items.get("name", None):
+            assert collection.name == check_items.get("name")
+        if check_items.get("schema", None):
+            assert collection.schema == check_items.get("schema")
+        if check_items.get("num_entities", None):
+            if check_items.get("num_entities") == 0:
+                assert collection.is_empty
+            assert collection.num_entities == check_items.get("num_entities")
+        if check_items.get("primary", None):
+            assert collection.primary_field.name == check_items.get("primary")
        return True

    @staticmethod
--- a/tests20/python_client/testcases/test_collection.py
+++ b/tests20/python_client/testcases/test_collection.py
@ -1,3 +1,4 @@
+import numpy
 import pandas as pd
 import pytest
 from pymilvus import DataType
@ -55,7 +56,8 @@ class TestCollectionParams(TestcaseBase):
        c_name = cf.gen_unique_str(prefix)
        self.collection_wrap.init_collection(c_name, schema=default_schema,
                                             check_task=CheckTasks.check_collection_property,
-                                             check_items={exp_name: c_name, exp_schema: default_schema})
+                                             check_items={exp_name: c_name, exp_schema: default_schema, exp_num: 0,
+                                                          exp_primary: ct.default_int64_field_name})
        assert c_name, _ in self.utility_wrap.list_collections()

    @pytest.mark.tags(CaseLabel.L0)
@ -351,7 +353,8 @@ class TestCollectionParams(TestcaseBase):
        """
        self._connect()
        c_name = cf.gen_unique_str(prefix)
-        field, _ = self.field_schema_wrap.init_field_schema(name=ct.default_int64_field_name, dtype=5.0, is_primary=True)
+        field, _ = self.field_schema_wrap.init_field_schema(name=ct.default_int64_field_name, dtype=5.0,
+                                                            is_primary=True)
        schema = cf.gen_collection_schema(fields=[field, cf.gen_float_vec_field()])
        error = {ct.err_code: 0, ct.err_msg: "Field type must be of DataType"}
        self.collection_wrap.init_collection(c_name, schema=schema, check_task=CheckTasks.err_res, check_items=error)
@ -387,6 +390,7 @@ class TestCollectionParams(TestcaseBase):
        assert not self.utility_wrap.has_collection(c_name)[0]

    @pytest.mark.tags(CaseLabel.L0)
+    @pytest.mark.skip(reason="waiting for required int primary field")
    @pytest.mark.parametrize("field", [cf.gen_float_vec_field(), cf.gen_binary_vec_field()])
    def test_collection_only_vector_field(self, field):
        """
@ -707,36 +711,22 @@ class TestCollectionParams(TestcaseBase):
        assert schema.auto_id == auto_id

    @pytest.mark.tags(CaseLabel.L1)
-    @pytest.mark.xfail(reason="#5945")
    def test_collection_auto_id_none_in_field(self):
        """
        target: test collection with auto_id is None
        method: set auto_id=None
-        expected: todo
+        expected: raise exception
        """
        self._connect()
-        int_field = cf.gen_int64_field(is_primary=True, auto_id=None)
-        vec_field = cf.gen_float_vec_field(name='vec')
-        schema, _ = self.collection_schema_wrap.init_collection_schema([int_field, vec_field])
-        log.debug(schema.auto_id)
+        error = {ct.err_code: 0, ct.err_msg: "Param auto_id must be bool type"}
+        self.field_schema_wrap.init_field_schema(name=ct.default_int64_field_name, dtype=DataType.INT64,
+                                                 is_primary=True,
+                                                 auto_id=None, check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.xfail(reason="#5945")
-    def test_collection_auto_id_none_in_schema(self):
-        """
-        target: test collection with auto_id=None in collection schema
-        method: set auto_id=None in collection schema
-        expected: todo
-        """
-        self._connect()
-        int_field = cf.gen_int64_field(is_primary=True)
-        vec_field = cf.gen_float_vec_field(name='vec')
-        schema, _ = self.collection_schema_wrap.init_collection_schema([int_field, vec_field], auto_id=None)
-        log.debug(schema.auto_id)
-
-    @pytest.mark.tags(CaseLabel.L1)
-    @pytest.mark.todo(reason="if #5945 throw exception, then remove test_collection_auto_id_none_in_schema")
-    def test_collection_invalid_auto_id(self, get_none_removed_invalid_strings):
+    @pytest.mark.parametrize("auto_id", ct.get_invalid_strs)
+    def test_collection_invalid_auto_id(self, auto_id):
        """
        target: test collection with invalid auto_id
        method: define field with auto_id=non-bool
@ -745,7 +735,6 @@ class TestCollectionParams(TestcaseBase):
        self._connect()
        int_field = cf.gen_int64_field(is_primary=True)
        vec_field = cf.gen_float_vec_field(name='vec')
-        auto_id = get_none_removed_invalid_strings
        error = {ct.err_code: 0, ct.err_msg: "Param auto_id must be bool type"}
        self.collection_schema_wrap.init_collection_schema([int_field, vec_field], auto_id=auto_id,
                                                           check_task=CheckTasks.err_res, check_items=error)
@ -965,7 +954,6 @@ class TestCollectionOperation(TestcaseBase):
        assert self.utility_wrap.has_collection(c_name)[0]


-@pytest.mark.skip(reason="waiting for debug")
 class TestCollectionDataframe(TestcaseBase):
    """
    ******************************************************************
@ -980,7 +968,7 @@ class TestCollectionDataframe(TestcaseBase):
        yield request.param

    @pytest.mark.tags(CaseLabel.L0)
-    @pytest.mark.skip(reason="waiting for primary field")
+    @pytest.mark.xfail(reason="issue #5947")
    def test_construct_from_dataframe(self):
        """
        target: test collection with dataframe data
@ -990,27 +978,25 @@ class TestCollectionDataframe(TestcaseBase):
        conn = self._connect()
        c_name = cf.gen_unique_str(prefix)
        df = cf.gen_default_dataframe_data(ct.default_nb)
-        self.collection_wrap.construct_from_dataframe(c_name, df,
+        self.collection_wrap.construct_from_dataframe(c_name, df, priamry_field=ct.default_int64_field_name,
                                                      check_task=CheckTasks.check_collection_property,
                                                      check_items={exp_name: c_name, exp_schema: default_schema})
        conn.flush([c_name])
        assert self.collection_wrap.num_entities == ct.default_nb

    @pytest.mark.tags(CaseLabel.L0)
-    @pytest.mark.xfail(reason="issue #5675")
    def test_construct_from_binary_dataframe(self):
        """
        target: test binary collection with dataframe
        method: create binary collection with dataframe
        expected: collection num entities equal to nb
        """
-        conn = self._connect()
+        self._connect()
        c_name = cf.gen_unique_str(prefix)
        df, _ = cf.gen_default_binary_dataframe_data(nb=ct.default_nb)
-        self.collection_wrap.construct_from_dataframe(c_name, df,
+        self.collection_wrap.construct_from_dataframe(c_name, df, priamry_field=ct.default_int64_field_name,
                                                      check_task=CheckTasks.check_collection_property,
                                                      check_items={exp_name: c_name, exp_schema: default_binary_schema})
-        conn.flush([c_name])
        assert self.collection_wrap.num_entities == ct.default_nb

    @pytest.mark.tags(CaseLabel.L1)
@ -1025,19 +1011,6 @@ class TestCollectionDataframe(TestcaseBase):
        error = {ct.err_code: 0, ct.err_msg: "Dataframe can not be None!"}
        self.collection_wrap.construct_from_dataframe(c_name, None, check_task=CheckTasks.err_res, check_items=error)

-    @pytest.mark.tags(CaseLabel.L1)
-    def test_construct_from_empty_dataframe(self):
-        """
-        target: test create collection by empty dataframe
-        method: invalid dataframe type create collection
-        expected: raise exception
-        """
-        self._connect()
-        c_name = cf.gen_unique_str(prefix)
-        df = pd.DataFrame()
-        error = {ct.err_code: 0, ct.err_msg: "The field of the schema cannot be empty"}
-        self.collection_wrap.construct_from_dataframe(c_name, df, check_task=CheckTasks.err_res, check_items=error)
-
    @pytest.mark.tags(CaseLabel.L1)
    def test_construct_from_dataframe_only_column(self):
        """
@ -1049,7 +1022,8 @@ class TestCollectionDataframe(TestcaseBase):
        c_name = cf.gen_unique_str(prefix)
        df = pd.DataFrame(columns=[ct.default_int64_field_name, ct.default_float_vec_field_name])
        error = {ct.err_code: 0, ct.err_msg: "Cannot infer schema from empty dataframe"}
-        self.collection_wrap.construct_from_dataframe(c_name, df, check_task=CheckTasks.err_res, check_items=error)
+        self.collection_wrap.construct_from_dataframe(c_name, df, priamry_field=ct.default_int64_field_name,
+                                                      check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    def test_construct_from_inconsistent_dataframe(self):
@ -1064,7 +1038,8 @@ class TestCollectionDataframe(TestcaseBase):
        mix_data = [(1, 2., [0.1, 0.2]), (2, 3., 4)]
        df = pd.DataFrame(data=mix_data, columns=list("ABC"))
        error = {ct.err_code: 0, ct.err_msg: "The data in the same column must be of the same type"}
-        self.collection_wrap.construct_from_dataframe(c_name, df, check_task=CheckTasks.err_res, check_items=error)
+        self.collection_wrap.construct_from_dataframe(c_name, df, priamry_field='A', check_task=CheckTasks.err_res,
+                                                      check_items=error)

    @pytest.mark.tags(CaseLabel.L0)
    def test_construct_from_non_dataframe(self, get_non_df):
@ -1080,9 +1055,7 @@ class TestCollectionDataframe(TestcaseBase):
        self.collection_wrap.construct_from_dataframe(c_name, df, check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
-    @pytest.mark.parametrize("df", [pd.DataFrame({"date": pd.date_range('20210101', periods=3)}),
-                                    pd.DataFrame({'%$#': cf.gen_vectors(3, 2)})])
-    def test_construct_from_invalid_dataframe(self, df):
+    def test_construct_from_data_type_dataframe(self):
        """
        target: test collection with invalid dataframe
        method: create with invalid dataframe
@ -1090,11 +1063,182 @@ class TestCollectionDataframe(TestcaseBase):
        """
        self._connect()
        c_name = cf.gen_unique_str(prefix)
-        error = {ct.err_code: 0, ct.err_msg: "Invalid field name"}
-        self.collection_wrap.construct_from_dataframe(c_name, df, check_task=CheckTasks.err_res, check_items=error)
+        df = pd.DataFrame({"date": pd.date_range('20210101', periods=3), ct.default_int64_field_name: [1, 2, 3]})
+        error = {ct.err_code: 0, ct.err_msg: "Cannot infer schema from dataframe"}
+        self.collection_wrap.construct_from_dataframe(c_name, df, priamry_field=ct.default_int64_field_name,
+                                                      check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
-    @pytest.mark.skip(reason="waiting for primary field")
+    def test_construct_from_invalid_field_name(self):
+        """
+        target: test collection with invalid field name
+        method: create with invalid field name dataframe
+        expected: raise exception
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        df = pd.DataFrame({'%$#': cf.gen_vectors(3, 2), ct.default_int64_field_name: [1, 2, 3]})
+        error = {ct.err_code: 1, ct.err_msg: "Invalid field name"}
+        self.collection_wrap.construct_from_dataframe(c_name, df, priamry_field=ct.default_int64_field_name,
+                                                      check_task=CheckTasks.err_res, check_items=error)
+
+    @pytest.mark.tags(CaseLabel.L1)
+    def test_construct_none_primary_field(self):
+        """
+        target: test collection with none primary field
+        method: primary_field is none
+        expected: raise exception
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        df = cf.gen_default_dataframe_data(ct.default_nb)
+        error = {ct.err_code: 0, ct.err_msg: "Schema must have a primary key field!"}
+        self.collection_wrap.construct_from_dataframe(c_name, df, priamry_field=None,
+                                                      check_task=CheckTasks.err_res, check_items=error)
+
+    @pytest.mark.tags(CaseLabel.L1)
+    def test_construct_not_existed_primary_field(self):
+        """
+        target: test collection with not existed primary field
+        method: primary field not existed
+        expected: raise exception
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        df = cf.gen_default_dataframe_data(ct.default_nb)
+        error = {ct.err_code: 0, ct.err_msg: "Must be have a primary key field"}
+        self.collection_wrap.construct_from_dataframe(c_name, df, priamry_field=c_name,
+                                                      check_task=CheckTasks.err_res, check_items=error)
+
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.xfail(reason="issue #5945")
+    def test_construct_with_none_auto_id(self):
+        """
+        target: test construct with non-int64 as primary field
+        method: non-int64 as primary field
+        expected: raise exception
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        df = cf.gen_default_dataframe_data(ct.default_nb)
+        error = {ct.err_code: 0, ct.err_msg: "Must be have a primary key field"}
+        self.collection_wrap.construct_from_dataframe(c_name, df, priamry_field=ct.default_int64_field_name,
+                                                      auto_id=None)
+        log.debug(self.collection_wrap.schema)
+
+    @pytest.mark.tags(CaseLabel.L0)
+    def test_construct_auto_id_true_insert(self):
+        """
+        target: test construct with true auto_id
+        method: auto_id=True and insert values
+        expected: raise exception
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        df = cf.gen_default_dataframe_data(nb=100)
+        error = {ct.err_code: 0, ct.err_msg: "Auto_id is True, but get the data of primary key field"}
+        self.collection_wrap.construct_from_dataframe(c_name, df, priamry_field=ct.default_int64_field_name,
+                                                      auto_id=True, check_task=CheckTasks.err_res, check_items=error)
+
+    @pytest.mark.tags(CaseLabel.L0)
+    @pytest.mark.xfail(reason="#5967")
+    def test_construct_auto_id_true_no_insert(self):
+        """
+        target: test construct with true auto_id
+        method: auto_id=True and not insert ids
+        expected: verify num entities
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        df = cf.gen_default_dataframe_data(ct.default_nb)
+        df.drop(ct.default_int64_field_name, axis=1, inplace=True)
+        self.collection_wrap.construct_from_dataframe(c_name, df, priamry_field=ct.default_int64_field_name,
+                                                      auto_id=True)
+        assert self.collection_wrap.num_entities == ct.default_nb
+
+    @pytest.mark.xfail(reason="#5970")
+    @pytest.mark.tags(CaseLabel.L2)
+    def test_construct_none_value_auto_id_true(self):
+        """
+        target: test construct with none value, auto_id
+        method: df primary field with none value, auto_id=true
+        expected: todo
+        """
+        nb = 100
+        df = cf.gen_default_dataframe_data(nb)
+        log.debug(df.head(3))
+        df.iloc[:, 0] = numpy.NaN
+        log.debug(df.head(3))
+        self.collection_wrap.construct_from_dataframe(cf.gen_unique_str(prefix), df,
+                                                      priamry_field=ct.default_int64_field_name, auto_id=True)
+        log.debug(self.collection_wrap.num_entities)
+
+    @pytest.mark.tags(CaseLabel.L0)
+    def test_construct_auto_id_false(self):
+        """
+        target: test construct with false auto_id
+        method: auto_id=False, primary_field correct
+        expected: verify auto_id
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        df = cf.gen_default_dataframe_data(ct.default_nb)
+        self.collection_wrap.construct_from_dataframe(c_name, df, priamry_field=ct.default_int64_field_name,
+                                                      auto_id=False)
+        assert not self.collection_wrap.schema.auto_id
+        assert self.collection_wrap.num_entities == ct.default_nb
+
+    @pytest.mark.tags(CaseLabel.L1)
+    def test_construct_none_value_auto_id_false(self):
+        """
+        target: test construct with none value, auto_id
+        method: df primary field with none value, auto_id=false
+        expected: raise exception
+        """
+        self._connect()
+        nb = 100
+        df = cf.gen_default_dataframe_data(nb)
+        df.iloc[:, 0] = numpy.NaN
+        error = {ct.err_code: 0, ct.err_msg: "Primary key type must be DataType.INT64"}
+        self.collection_wrap.construct_from_dataframe(cf.gen_unique_str(prefix), df,
+                                                      priamry_field=ct.default_int64_field_name, auto_id=False,
+                                                      check_task=CheckTasks.err_res, check_items=error)
+
+    @pytest.mark.xfail(reason="#5977")
+    @pytest.mark.tags(CaseLabel.L1)
+    def test_construct_auto_id_false_repeated_values(self):
+        """
+        target: test construct with false auto_id and repeated values
+        method: auto_id=False, primary field values repeated
+        expected: raise exception
+        """
+        self._connect()
+        nb = 100
+        df = cf.gen_default_dataframe_data(nb)
+        df.iloc[1:, 0] = 1
+        # error = {ct.err_code: 0, ct.err_msg: "Primary key type must be DataType.INT64"}
+        self.collection_wrap.construct_from_dataframe(cf.gen_unique_str(prefix), df,
+                                                      priamry_field=ct.default_int64_field_name, auto_id=False)
+        log.debug(self.collection_wrap.num_entities)
+
+    @pytest.mark.tags(CaseLabel.L1)
+    def test_construct_auto_id_false_negative_values(self):
+        """
+        target: test construct with negative values
+        method: auto_id=False, primary field values is negative
+        expected: todo
+        """
+        self._connect()
+        nb = 100
+        df = cf.gen_default_dataframe_data(nb)
+        new_values = pd.Series(data=[i for i in range(0, -nb, -1)])
+        df[ct.default_int64_field_name] = new_values
+        self.collection_wrap.construct_from_dataframe(cf.gen_unique_str(prefix), df,
+                                                      priamry_field=ct.default_int64_field_name, auto_id=False)
+        assert self.collection_wrap.num_entities == nb
+
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.xfail(reason="#5947")
    def test_construct_from_dataframe_dup_name(self):
        """
        target: test collection with dup name and insert dataframe
@ -1103,11 +1247,12 @@ class TestCollectionDataframe(TestcaseBase):
        """
        conn = self._connect()
        c_name = cf.gen_unique_str(prefix)
-        collection_w = self.init_collection_wrap(name=c_name,
+        collection_w = self.init_collection_wrap(name=c_name, priamry_field=ct.default_int64_field_name,
                                                 check_task=CheckTasks.check_collection_property,
                                                 check_items={exp_name: c_name, exp_schema: default_schema})
        df = cf.gen_default_dataframe_data(ct.default_nb)
-        self.collection_wrap.construct_from_dataframe(c_name, df, check_task=CheckTasks.check_collection_property,
+        self.collection_wrap.construct_from_dataframe(c_name, df, priamry_field=ct.default_int64_field_name,
+                                                      check_task=CheckTasks.check_collection_property,
                                                      check_items={exp_name: c_name, exp_schema: default_schema})
        conn.flush([collection_w.name])
        assert collection_w.num_entities == ct.default_nb