[skip e2e]Add network latency chaos test (#15901)

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
This commit is contained in:
zhuwenxing 2022-03-07 16:18:03 +08:00 committed by GitHub
parent c808862242
commit be30ccfac9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 609 additions and 0 deletions

View File

@ -0,0 +1,197 @@
# Manually triggered workflow that runs the network-latency chaos test once per
# Milvus component. Each matrix job creates a kind cluster, installs Chaos Mesh
# and Milvus with Helm into the `chaos-testing` namespace, runs the chaos test,
# then re-checks the deployment with the E2E and data-consistency tests and
# uploads the collected logs.
name: Network Latency Chaos Test
on:
  workflow_dispatch:
jobs:
  test-network-latency-chaos:
    runs-on: ubuntu-latest
    timeout-minutes: 40
    strategy:
      # Keep the other components running when one matrix job fails.
      fail-fast: false
      matrix:
        pod:
          - datacoord
          - datanode
          - indexcoord
          - indexnode
          - proxy
          - pulsar
          - querycoord
          - querynode
          - rootcoord
          - etcd
          - minio
    steps:
      # RELEASE is the Helm release name reused by every later step.
      - name: Set env param
        run: |
          echo "RELEASE=test-${{ matrix.pod }}-network-latency" >> "$GITHUB_ENV"
      - name: Creating kind cluster
        uses: helm/kind-action@v1.2.0
      - name: Print cluster information
        run: |
          kubectl config view
          kubectl cluster-info
          kubectl get nodes
          kubectl get pods -n kube-system
          helm version
          kubectl version
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          # Quoted so the version is always read as a string, never a float.
          python-version: "3.8"
      - name: Install dependency
        uses: nick-invision/retry@v2
        with:
          timeout_minutes: 5
          max_attempts: 3
          retry_on: error
          shell: bash
          # --trusted-host takes a host (or host:port), not a URL.
          command: |
            pip install -r tests/python_client/requirements.txt --trusted-host test.pypi.org
            pip install --upgrade protobuf
      - name: Deploy Chaos Mesh
        shell: bash
        run: |
          helm repo add chaos-mesh https://charts.chaos-mesh.org
          helm search repo chaos-mesh
          kubectl create ns chaos-testing
          helm install --wait --timeout 360s chaos-mesh chaos-mesh/chaos-mesh --namespace=chaos-testing --version v2.0.3 --set chaosDaemon.runtime=containerd --set chaosDaemon.socketPath=/run/containerd/containerd.sock
          kubectl get po -n chaos-testing
      # Installs Milvus, forwards port 19530 to localhost and runs a smoke test.
      # The matrix currently has no "standalone" entry, so only the cluster
      # branch is taken.
      - name: Deploy Milvus
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          echo "latest tag:"
          bash ../../../scripts/docker_image_find_tag.sh -n milvusdb/milvus-dev -t master-latest -f master- -F -L -q
          helm repo add milvus https://milvus-io.github.io/milvus-helm
          helm repo update
          if [[ ${{ matrix.pod }} != *"standalone"* ]]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus -f cluster-values.yaml -n=chaos-testing; fi
          if [[ ${{ matrix.pod }} == *"standalone"* ]]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus -f standalone-values.yaml -n=chaos-testing; fi
          kubectl get pods -n chaos-testing
          sleep 20s
          kubectl get pods -n chaos-testing
          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
          sleep 20s
          # check whether port-forward success
          nc -vz 127.0.0.1 19530
          # check whether milvus server is healthy
          python scripts/hello_milvus.py
      # Points constants.py at the network_latency chaos objects for this pod
      # and runs the chaos test. A pytest failure is only echoed, so this step
      # itself does not fail the job.
      - name: Chaos Test
        timeout-minutes: 15
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          # replace chaos object
          sed -i "s/TESTS_CONFIG_LOCATION =.*/TESTS_CONFIG_LOCATION = \'chaos_objects\/network_latency\/'/g" constants.py
          sed -i "s/ALL_CHAOS_YAMLS =.*/ALL_CHAOS_YAMLS = \'chaos_${{ matrix.pod }}_network_latency.yaml\'/g" constants.py
          sed -i "s/RELEASE_NAME =.*/RELEASE_NAME = \'${{ env.RELEASE }}\'/g" constants.py
          cat constants.py
          timeout 14m pytest -s -v test_chaos.py --host 127.0.0.1 --log-cli-level=INFO --capture=no || echo "chaos test failed"
      - name: Result Analysis
        timeout-minutes: 15
        shell: bash
        working-directory: tests/python_client/chaos/reports
        run: |
          echo "result analysis"
          cat ${{ env.RELEASE }}.log || echo "no log file"
      # First E2E pass: run right after the chaos test, once all pods are Ready.
      - name: Milvus E2E Test
        timeout-minutes: 10
        if: ${{ always() }}
        shell: bash
        working-directory: tests/python_client
        run: |
          kubectl get networkchaos -n chaos-testing
          kubectl get pod -n chaos-testing
          # wait all pod to be ready
          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/instance=${{ env.RELEASE }} -n chaos-testing --timeout=360s
          kubectl wait --for=condition=Ready pod -l release=${{ env.RELEASE }} -n chaos-testing --timeout=360s
          kubectl get pod -n chaos-testing
          # best-effort cleanup of a stale port-forward; must not fail the step
          # (shell runs with -eo pipefail) when no such process exists
          ps aux|grep forward|grep -v grep|awk '{print $2}'|xargs -r kill -9 || true
          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
          sleep 20s
          nc -vz 127.0.0.1 19530
          pytest -s -v testcases/test_e2e.py --host 127.0.0.1 --log-cli-level=INFO --capture=no
          python chaos/scripts/hello_milvus.py --host 127.0.0.1
      - name: Export logs
        if: ${{ always() }}
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          # list the pods so their status and age are visible in the job log
          kubectl get po -n chaos-testing
          # export k8s log for chaos mesh and milvus
          bash ../../scripts/export_log_k8s.sh chaos-testing ${{ env.RELEASE }} k8s_logs/chaos-test
      # Runs only when an earlier step has failed: reinstall Milvus so the
      # remaining tests start from a fresh deployment.
      - name: Deploy Milvus Again If Previous E2E Test Failed
        timeout-minutes: 15
        if: ${{ failure() }}
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          kubectl config set-context --current --namespace=chaos-testing
          bash scripts/uninstall_milvus.sh ${{ env.RELEASE }}
          if [ ${{ matrix.pod }} != "standalone" ]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus -f cluster-values.yaml -n=chaos-testing; fi
          if [ ${{ matrix.pod }} == "standalone" ]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus --set cluster.enabled=false --set etcd.replicaCount=1 --set minio.mode=standalone --set pulsar.enabled=false -n=chaos-testing; fi
          kubectl get pods -n chaos-testing
          sleep 20s
          kubectl get pods -n chaos-testing
          # best-effort cleanup; the old port-forward may already be gone after
          # the uninstall above
          ps aux|grep forward|grep -v grep|awk '{print $2}'|xargs -r kill -9 || true
          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
          sleep 20s
          # check whether port-forward success
          nc -vz 127.0.0.1 19530
          # check whether milvus server is healthy
          python scripts/hello_milvus.py
      # A pytest failure is only echoed, so this step does not fail the job.
      - name: Data Consist Test
        timeout-minutes: 5
        if: ${{ always() }}
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          pytest -s -v test_chaos_data_consist.py --host 127.0.0.1 --log-cli-level=INFO --capture=no || echo "data consist chaos test failed"
      # Second E2E pass: run after the data-consistency test.
      - name: Milvus E2E Test
        timeout-minutes: 10
        if: ${{ always() }}
        shell: bash
        working-directory: tests/python_client
        run: |
          kubectl get pod -n chaos-testing
          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/instance=${{ env.RELEASE }} -n chaos-testing --timeout=360s
          kubectl wait --for=condition=Ready pod -l release=${{ env.RELEASE }} -n chaos-testing --timeout=360s
          kubectl get pod -n chaos-testing
          # best-effort cleanup of a stale port-forward; must not fail the step
          ps aux|grep forward|grep -v grep|awk '{print $2}'|xargs -r kill -9 || true
          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
          sleep 20s
          nc -vz 127.0.0.1 19530
          pytest -s -v testcases/test_e2e.py --host 127.0.0.1 --log-cli-level=INFO --capture=no
          python chaos/scripts/hello_milvus.py --host 127.0.0.1
      - name: Export logs
        if: ${{ always() }}
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          # list the pods so their status and age are visible in the job log
          kubectl get po -n chaos-testing
          # export k8s log for chaos mesh and milvus
          bash ../../scripts/export_log_k8s.sh chaos-testing ${{ env.RELEASE }} k8s_logs/data-consist-test
          bash ../../scripts/export_log_k8s.sh chaos-testing chaos-daemon k8s_logs/chaos-mesh-daemon
      # One artifact per matrix entry, holding the exported k8s logs and reports.
      - name: Upload logs
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: logs-${{ matrix.pod }}
          path: |
            tests/python_client/chaos/k8s_logs
            tests/python_client/chaos/reports

View File

@ -0,0 +1,26 @@
---
# Chaos Mesh NetworkChaos experiment: adds a fixed 200ms delay on traffic
# between the datacoord pods and the pods labelled
# app.kubernetes.io/instance=milvus-chaos in the chaos-testing namespace.
# NOTE(review): the instance label is hard-coded to `milvus-chaos`; presumably
# the test harness rewrites it to the deployed release name - confirm.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: test-datacoord-network-latency
  namespace: chaos-testing
spec:
  action: delay
  delay:
    latency: 200ms
    jitter: 0ms
    correlation: '100'
  # Pods the delay is injected on; `mode: all` selects every match.
  mode: all
  selector:
    namespaces:
      - chaos-testing
    labelSelectors:
      app.kubernetes.io/instance: milvus-chaos
      component: datacoord
  # Peer pods for the delayed traffic.
  direction: both
  target:
    mode: all
    selector:
      namespaces:
        - chaos-testing
      labelSelectors:
        app.kubernetes.io/instance: milvus-chaos

View File

@ -0,0 +1,26 @@
---
# Chaos Mesh NetworkChaos experiment: adds a fixed 200ms delay on traffic
# between the datanode pods and the pods labelled
# app.kubernetes.io/instance=milvus-chaos in the chaos-testing namespace.
# NOTE(review): the instance label is hard-coded to `milvus-chaos`; presumably
# the test harness rewrites it to the deployed release name - confirm.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: test-datanode-network-latency
  namespace: chaos-testing
spec:
  action: delay
  delay:
    latency: 200ms
    jitter: 0ms
    correlation: '100'
  # Pods the delay is injected on; `mode: all` selects every match.
  mode: all
  selector:
    namespaces:
      - chaos-testing
    labelSelectors:
      app.kubernetes.io/instance: milvus-chaos
      component: datanode
  # Peer pods for the delayed traffic.
  direction: both
  target:
    mode: all
    selector:
      namespaces:
        - chaos-testing
      labelSelectors:
        app.kubernetes.io/instance: milvus-chaos

View File

@ -0,0 +1,26 @@
---
# Chaos Mesh NetworkChaos experiment: adds a fixed 200ms delay on traffic
# between the etcd pods (app.kubernetes.io/name=etcd) and the pods labelled
# app.kubernetes.io/instance=milvus-chaos in the chaos-testing namespace.
# NOTE(review): the instance label is hard-coded to `milvus-chaos`; presumably
# the test harness rewrites it to the deployed release name - confirm.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: test-etcd-network-latency
  namespace: chaos-testing
spec:
  action: delay
  delay:
    latency: 200ms
    jitter: 0ms
    correlation: '100'
  # Pods the delay is injected on; `mode: all` selects every match.
  mode: all
  selector:
    namespaces:
      - chaos-testing
    labelSelectors:
      app.kubernetes.io/instance: milvus-chaos
      app.kubernetes.io/name: etcd
  # Peer pods for the delayed traffic.
  direction: both
  target:
    mode: all
    selector:
      namespaces:
        - chaos-testing
      labelSelectors:
        app.kubernetes.io/instance: milvus-chaos

View File

@ -0,0 +1,26 @@
---
# Chaos Mesh NetworkChaos experiment: adds a fixed 200ms delay on traffic
# between the indexcoord pods and the pods labelled
# app.kubernetes.io/instance=milvus-chaos in the chaos-testing namespace.
# NOTE(review): the instance label is hard-coded to `milvus-chaos`; presumably
# the test harness rewrites it to the deployed release name - confirm.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: test-indexcoord-network-latency
  namespace: chaos-testing
spec:
  action: delay
  delay:
    latency: 200ms
    jitter: 0ms
    correlation: '100'
  # Pods the delay is injected on; `mode: all` selects every match.
  mode: all
  selector:
    namespaces:
      - chaos-testing
    labelSelectors:
      app.kubernetes.io/instance: milvus-chaos
      component: indexcoord
  # Peer pods for the delayed traffic.
  direction: both
  target:
    mode: all
    selector:
      namespaces:
        - chaos-testing
      labelSelectors:
        app.kubernetes.io/instance: milvus-chaos

View File

@ -0,0 +1,26 @@
---
# Chaos Mesh NetworkChaos experiment: adds a fixed 200ms delay on traffic
# between the indexnode pods and the pods labelled
# app.kubernetes.io/instance=milvus-chaos in the chaos-testing namespace.
# NOTE(review): the instance label is hard-coded to `milvus-chaos`; presumably
# the test harness rewrites it to the deployed release name - confirm.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: test-indexnode-network-latency
  namespace: chaos-testing
spec:
  action: delay
  delay:
    latency: 200ms
    jitter: 0ms
    correlation: '100'
  # Pods the delay is injected on; `mode: all` selects every match.
  mode: all
  selector:
    namespaces:
      - chaos-testing
    labelSelectors:
      app.kubernetes.io/instance: milvus-chaos
      component: indexnode
  # Peer pods for the delayed traffic.
  direction: both
  target:
    mode: all
    selector:
      namespaces:
        - chaos-testing
      labelSelectors:
        app.kubernetes.io/instance: milvus-chaos

View File

@ -0,0 +1,26 @@
---
# Chaos Mesh NetworkChaos experiment: adds a fixed 200ms delay on traffic
# between the minio pods (release=milvus-chaos, app=minio) and the pods
# labelled app.kubernetes.io/instance=milvus-chaos in the chaos-testing
# namespace.
# NOTE(review): the release/instance labels are hard-coded to `milvus-chaos`;
# presumably the test harness rewrites them to the deployed release name -
# confirm.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: test-minio-network-latency
  namespace: chaos-testing
spec:
  action: delay
  delay:
    latency: 200ms
    jitter: 0ms
    correlation: '100'
  # Pods the delay is injected on; `mode: all` selects every match.
  mode: all
  selector:
    namespaces:
      - chaos-testing
    labelSelectors:
      release: milvus-chaos
      app: minio
  # Peer pods for the delayed traffic.
  direction: both
  target:
    mode: all
    selector:
      namespaces:
        - chaos-testing
      labelSelectors:
        app.kubernetes.io/instance: milvus-chaos

View File

@ -0,0 +1,26 @@
---
# Chaos Mesh NetworkChaos experiment: adds a fixed 200ms delay on traffic
# between the proxy pods and the pods labelled
# app.kubernetes.io/instance=milvus-chaos in the chaos-testing namespace.
# NOTE(review): the instance label is hard-coded to `milvus-chaos`; presumably
# the test harness rewrites it to the deployed release name - confirm.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: test-proxy-network-latency
  namespace: chaos-testing
spec:
  action: delay
  delay:
    latency: 200ms
    jitter: 0ms
    correlation: '100'
  # Pods the delay is injected on; `mode: all` selects every match.
  mode: all
  selector:
    namespaces:
      - chaos-testing
    labelSelectors:
      app.kubernetes.io/instance: milvus-chaos
      component: proxy
  # Peer pods for the delayed traffic.
  direction: both
  target:
    mode: all
    selector:
      namespaces:
        - chaos-testing
      labelSelectors:
        app.kubernetes.io/instance: milvus-chaos

View File

@ -0,0 +1,26 @@
---
# Chaos Mesh NetworkChaos experiment: adds a fixed 200ms delay on traffic
# between the pulsar pods (release=milvus-chaos, app=pulsar) and the pods
# labelled app.kubernetes.io/instance=milvus-chaos in the chaos-testing
# namespace.
# NOTE(review): the release/instance labels are hard-coded to `milvus-chaos`;
# presumably the test harness rewrites them to the deployed release name -
# confirm.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: test-pulsar-network-latency
  namespace: chaos-testing
spec:
  action: delay
  delay:
    latency: 200ms
    jitter: 0ms
    correlation: '100'
  # Pods the delay is injected on; `mode: all` selects every match.
  mode: all
  selector:
    namespaces:
      - chaos-testing
    labelSelectors:
      release: milvus-chaos
      app: pulsar
  # Peer pods for the delayed traffic.
  direction: both
  target:
    mode: all
    selector:
      namespaces:
        - chaos-testing
      labelSelectors:
        app.kubernetes.io/instance: milvus-chaos

View File

@ -0,0 +1,26 @@
---
# Chaos Mesh NetworkChaos experiment: adds a fixed 200ms delay on traffic
# between the querycoord pods and the pods labelled
# app.kubernetes.io/instance=milvus-chaos in the chaos-testing namespace.
# NOTE(review): the instance label is hard-coded to `milvus-chaos`; presumably
# the test harness rewrites it to the deployed release name - confirm.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: test-querycoord-network-latency
  namespace: chaos-testing
spec:
  action: delay
  delay:
    latency: 200ms
    jitter: 0ms
    correlation: '100'
  # Pods the delay is injected on; `mode: all` selects every match.
  mode: all
  selector:
    namespaces:
      - chaos-testing
    labelSelectors:
      app.kubernetes.io/instance: milvus-chaos
      component: querycoord
  # Peer pods for the delayed traffic.
  direction: both
  target:
    mode: all
    selector:
      namespaces:
        - chaos-testing
      labelSelectors:
        app.kubernetes.io/instance: milvus-chaos

View File

@ -0,0 +1,26 @@
---
# Chaos Mesh NetworkChaos experiment: adds a fixed 200ms delay on traffic
# between the querynode pods and the pods labelled
# app.kubernetes.io/instance=milvus-chaos in the chaos-testing namespace.
# NOTE(review): the instance label is hard-coded to `milvus-chaos`; presumably
# the test harness rewrites it to the deployed release name - confirm.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: test-querynode-network-latency
  namespace: chaos-testing
spec:
  action: delay
  delay:
    latency: 200ms
    jitter: 0ms
    correlation: '100'
  # Pods the delay is injected on; `mode: all` selects every match.
  mode: all
  selector:
    namespaces:
      - chaos-testing
    labelSelectors:
      app.kubernetes.io/instance: milvus-chaos
      component: querynode
  # Peer pods for the delayed traffic.
  direction: both
  target:
    mode: all
    selector:
      namespaces:
        - chaos-testing
      labelSelectors:
        app.kubernetes.io/instance: milvus-chaos

View File

@ -0,0 +1,26 @@
---
# Chaos Mesh NetworkChaos experiment: adds a fixed 200ms delay on traffic
# between the rootcoord pods and the pods labelled
# app.kubernetes.io/instance=milvus-chaos in the chaos-testing namespace.
# NOTE(review): the instance label is hard-coded to `milvus-chaos`; presumably
# the test harness rewrites it to the deployed release name - confirm.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: test-rootcoord-network-latency
  namespace: chaos-testing
spec:
  action: delay
  delay:
    latency: 200ms
    jitter: 0ms
    correlation: '100'
  # Pods the delay is injected on; `mode: all` selects every match.
  mode: all
  selector:
    namespaces:
      - chaos-testing
    labelSelectors:
      app.kubernetes.io/instance: milvus-chaos
      component: rootcoord
  # Peer pods for the delayed traffic.
  direction: both
  target:
    mode: all
    selector:
      namespaces:
        - chaos-testing
      labelSelectors:
        app.kubernetes.io/instance: milvus-chaos

View File

@ -0,0 +1,126 @@
---
# Expected outcome of each client operation while a network-latency chaos
# object is applied. Each entry pairs a chaos YAML file with the expected
# result (`succ`, `fail` or `degrade`) per operation.
# NOTE(review): `cluster_1_node` / `cluster_n_nodes` presumably distinguish a
# single-replica from a multi-replica deployment - confirm against the test
# code that reads this file.
Collections:
  - testcase:
      name: test_querynode_network_latency
      chaos: chaos_querynode_network_latency.yaml
      expectation:
        cluster_1_node:
          search: fail
          query: fail
        cluster_n_nodes:
          search: degrade
          query: degrade
  - testcase:
      name: test_querycoord_network_latency
      chaos: chaos_querycoord_network_latency.yaml
      expectation:
        cluster_1_node:
          search: fail
          query: fail
        cluster_n_nodes:
          search: degrade
          query: degrade
  - testcase:
      name: test_datanode_network_latency
      chaos: chaos_datanode_network_latency.yaml
      expectation:
        cluster_1_node:
          insert: succ
          flush: fail
        cluster_n_nodes:
          insert: degrade
  - testcase:
      # Name fixed from `test_datascoord_...` to match the chaos file and the
      # naming of every other entry.
      name: test_datacoord_network_latency
      chaos: chaos_datacoord_network_latency.yaml
      expectation:
        cluster_1_node:
          insert: succ
          flush: fail
        cluster_n_nodes:
          insert: degrade
  - testcase:
      name: test_indexnode_network_latency
      chaos: chaos_indexnode_network_latency.yaml
      expectation:
        cluster_1_node:
          index: fail
        cluster_n_nodes:
          index: degrade
  - testcase:
      name: test_indexcoord_network_latency
      chaos: chaos_indexcoord_network_latency.yaml
      expectation:
        cluster_1_node:
          index: fail
        # NOTE(review): the indexnode entry expects `index: degrade` here;
        # `insert` may be a copy-paste slip - confirm before changing.
        cluster_n_nodes:
          insert: degrade
  - testcase:
      name: test_proxy_network_latency
      chaos: chaos_proxy_network_latency.yaml
      expectation:
        cluster_1_node:
          create: fail
          insert: fail
          flush: fail
          index: fail
          search: fail
          query: fail
        cluster_n_nodes:
          insert: fail
  - testcase:
      name: test_rootcoord_network_latency
      chaos: chaos_rootcoord_network_latency.yaml
      expectation:
        cluster_1_node:
          create: fail
          insert: fail
          flush: fail
          index: fail
          search: fail
          query: fail
        cluster_n_nodes:
          insert: degrade
  # The etcd, minio and pulsar entries define no `cluster_n_nodes` expectation.
  - testcase:
      name: test_etcd_network_latency
      chaos: chaos_etcd_network_latency.yaml
      expectation:
        cluster_1_node:
          create: fail
          insert: fail
          flush: fail
          index: fail
          search: fail
          query: fail
  - testcase:
      name: test_minio_network_latency
      chaos: chaos_minio_network_latency.yaml
      expectation:
        cluster_1_node:
          create: fail
          insert: fail
          flush: fail
          index: fail
          search: fail
          query: fail
  - testcase:
      name: test_pulsar_network_latency
      chaos: chaos_pulsar_network_latency.yaml
      expectation:
        cluster_1_node:
          create: fail
          insert: fail
          flush: fail
          index: fail
          search: fail
          query: fail