[skip e2e] Add all-pods-kill chaos test (#15761)

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
zhuwenxing 2022-02-25 18:41:53 +08:00 committed by GitHub
parent 51cac044aa
commit b745b6f707
9 changed files with 199 additions and 31 deletions


@@ -13,7 +13,7 @@ jobs:
strategy:
fail-fast: false
matrix:
pod: [standalone, datacoord, datanode, indexcoord, indexnode, proxy, pulsar, querycoord, querynode, rootcoord, etcd, minio]
pod: [allstandalone, allcluster, standalone, datacoord, datanode, indexcoord, indexnode, proxy, pulsar, querycoord, querynode, rootcoord, etcd, minio]
steps:
@@ -68,8 +68,8 @@ jobs:
bash ../../../scripts/docker_image_find_tag.sh -n milvusdb/milvus-dev -t master-latest -f master- -F -L -q
helm repo add milvus https://milvus-io.github.io/milvus-helm
helm repo update
if [ ${{ matrix.pod }} != "standalone" ]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus -f cluster-values.yaml -n=chaos-testing; fi
if [ ${{ matrix.pod }} == "standalone" ]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus -f standalone-values.yaml -n=chaos-testing; fi
if [[ ${{ matrix.pod }} != *"standalone"* ]]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus -f cluster-values.yaml -n=chaos-testing; fi
if [[ ${{ matrix.pod }} == *"standalone"* ]]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus -f standalone-values.yaml -n=chaos-testing; fi
kubectl get pods -n chaos-testing
sleep 20s
kubectl get pods -n chaos-testing
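
For reference, a minimal Python sketch (illustrative only; the workflow itself uses the bash pattern above) of where the new substring test routes each matrix entry, now that allstandalone and allcluster join the matrix:

# mirrors `[[ ${{ matrix.pod }} == *"standalone"* ]]` in the install step
pods = ["allstandalone", "allcluster", "standalone", "datacoord", "datanode",
        "indexcoord", "indexnode", "proxy", "pulsar", "querycoord",
        "querynode", "rootcoord", "etcd", "minio"]
for pod in pods:
    values_file = "standalone-values.yaml" if "standalone" in pod else "cluster-values.yaml"
    print(f"{pod:14s} -> {values_file}")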


@@ -0,0 +1,55 @@
apiVersion: chaos-mesh.org/v1alpha1
kind: Workflow
metadata:
name: test-allcluster-pod-kill
namespace: chaos-testing
spec:
entry: entry
templates:
- name: entry
templateType: Serial
deadline: 5m
children:
- test-all-pods-kill
- name: test-first-part-pod-kill
templateType: Schedule
deadline: 3m
schedule:
schedule: '*/5 * * * * *'
startingDeadlineSeconds: 60
concurrencyPolicy: Forbid
historyLimit: 1
type: PodChaos
podChaos:
selector:
namespaces:
- chaos-testing
labelSelectors:
release: milvus-chaos
mode: all
action: pod-kill
gracePeriod: 0
- name: test-second-part-pod-kill
templateType: Schedule
deadline: 3m
schedule:
schedule: '*/5 * * * * *'
startingDeadlineSeconds: 60
concurrencyPolicy: Forbid
historyLimit: 1
type: PodChaos
podChaos:
selector:
namespaces:
- chaos-testing
labelSelectors:
app.kubernetes.io/instance: milvus-chaos
mode: all
action: pod-kill
gracePeriod: 0
- name: test-all-pods-kill
templateType: Parallel
deadline: 3m
children:
- test-first-part-pod-kill
- test-second-part-pod-kill
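
In this workflow the entry template runs serially into test-all-pods-kill, which runs the two Schedule templates in parallel; each schedule fires a pod-kill every five seconds (the six-field cron '*/5 * * * * *') for up to three minutes against one of the two label conventions carried by the release's pods. A small Python sketch (illustrative only; values copied from the selectors above) of the equivalent label queries:

selectors = [
    {"release": "milvus-chaos"},                     # test-first-part-pod-kill
    {"app.kubernetes.io/instance": "milvus-chaos"},  # test-second-part-pod-kill
]
for labels in selectors:
    label_str = ",".join(f"{k}={v}" for k, v in labels.items())
    # roughly what `kubectl get pods -n chaos-testing -l <label_str>` would target
    print(label_str)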


@@ -0,0 +1,55 @@
apiVersion: chaos-mesh.org/v1alpha1
kind: Workflow
metadata:
name: test-allstandalone-pod-kill
namespace: chaos-testing
spec:
entry: entry
templates:
- name: entry
templateType: Serial
deadline: 5m
children:
- test-all-pods-kill
- name: test-first-part-pod-kill
templateType: Schedule
deadline: 3m
schedule:
schedule: '*/5 * * * * *'
startingDeadlineSeconds: 60
concurrencyPolicy: Forbid
historyLimit: 1
type: PodChaos
podChaos:
selector:
namespaces:
- chaos-testing
labelSelectors:
release: milvus-chaos
mode: all
action: pod-kill
gracePeriod: 0
- name: test-second-part-pod-kill
templateType: Schedule
deadline: 3m
schedule:
schedule: '*/5 * * * * *'
startingDeadlineSeconds: 60
concurrencyPolicy: Forbid
historyLimit: 1
type: PodChaos
podChaos:
selector:
namespaces:
- chaos-testing
labelSelectors:
app.kubernetes.io/instance: milvus-chaos
mode: all
action: pod-kill
gracePeriod: 0
- name: test-all-pods-kill
templateType: Parallel
deadline: 3m
children:
- test-first-part-pod-kill
- test-second-part-pod-kill


@@ -145,3 +145,27 @@ Collections:
index: fail
search: fail
query: fail
-
testcase:
name: test_allstandalone_pod_kill
chaos: chaos_allstandalone_pod_kill.yaml
expectation:
cluster_1_node:
create: fail
insert: fail
flush: fail
index: fail
search: fail
query: fail
-
testcase:
name: test_allcluster_pod_kill
chaos: chaos_allcluster_pod_kill.yaml
expectation:
cluster_1_node:
create: fail
insert: fail
flush: fail
index: fail
search: fail
query: fail


@@ -40,16 +40,16 @@ bash uninstall_milvus.sh ${release} ${ns}|| true
declare -A pod_map=(["querynode"]="queryNode" ["indexnode"]="indexNode" ["datanode"]="dataNode" ["proxy"]="proxy")
echo "install milvus"
if [ ${pod} != "standalone" ];
if [[ ${pod} != *"standalone"* ]];
then
echo "insatll cluster"
helm install --wait --timeout 360s ${release} milvus/milvus --set image.all.repository=${REPOSITORY:-"milvusdb/milvus-dev"} --set image.all.tag=${IMAGE_TAG:-"master-latest"} --set ${pod_map[${pod}]}.replicas=$node_num -f ../cluster-values.yaml -n=${ns}
helm install --wait --timeout 360s ${release} milvus/milvus --set ${pod_map[${pod}]}.replicas=$node_num -f ../cluster-values.yaml -n=${ns}
fi
if [ ${pod} == "standalone" ];
if [[ ${pod} == *"standalone"* ]];
then
echo "install standalone"
helm install --wait --timeout 360s ${release} milvus/milvus --set image.all.repository=${REPOSITORY:-"milvusdb/milvus-dev"} --set image.all.tag=${IMAGE_TAG:-"master-latest"} -f ../standalone-values.yaml -n=${ns}
helm install --wait --timeout 360s ${release} milvus/milvus -f ../standalone-values.yaml -n=${ns}
fi
# wait for all pods to be ready


@@ -17,8 +17,8 @@ ENTITIES_FOR_SEARCH = 3000 # entities for search_collection
CHAOS_CONFIG_ENV = 'CHAOS_CONFIG_PATH' # env variable for chaos config path
TESTS_CONFIG_LOCATION = 'chaos_objects/pod_kill/'
ALL_CHAOS_YAMLS = 'chaos_querynode_pod_kill.yaml'
RELEASE_NAME = 'test-querynode-pod-kill-17-33-50'
ALL_CHAOS_YAMLS = 'chaos_allstandalone_pod_kill.yaml'
RELEASE_NAME = 'test-allstandalone-pod-kill-19-25-26'
WAIT_PER_OP = 10 # time to wait in seconds between operations
CHAOS_DURATION = 120 # chaos duration time in seconds
DEFAULT_INDEX_PARAM = {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}


@@ -12,6 +12,7 @@ from chaos.checker import (CreateChecker, InsertFlushChecker,
from common.cus_resource_opts import CustomResourceOperations as CusResource
from utils.util_log import test_log as log
from utils.util_k8s import wait_pods_ready, get_pod_list
from utils.util_common import findkeys
from chaos import chaos_commons as cc
from common.common_type import CaseLabel
from chaos import constants
@@ -34,6 +35,27 @@ def assert_statistic(checkers, expectations={}):
f"Expect Succ: {str(k)} succ rate {succ_rate}, total: {total}, average time: {average_time:.4f}")
def check_cluster_nodes(chaos_config):
# If all pods will be affected, the expectation is that all operations fail.
# Even though the number of replicas is greater than 1, it cannot provide HA, so cluster_nodes is set to 1 in this situation.
if "all" in chaos_config["metadata"]["name"]:
return 1
selector = findkeys(chaos_config, "selector")
selector = list(selector)
log.info(f"chaos target selector: {selector}")
assert len(selector) == 1
selector = selector[0]
namespace = selector["namespaces"][0]
labels_dict = selector["labelSelectors"]
labels_list = []
for k,v in labels_dict.items():
labels_list.append(k+"="+v)
labels_str = ",".join(labels_list)
pods = get_pod_list(namespace, labels_str)
return len(pods)
def record_results(checkers):
res = ""
for k in checkers.keys():
@@ -57,17 +79,7 @@ class TestChaosBase:
health_checkers = {}
def parser_testcase_config(self, chaos_yaml, chaos_config):
# TODO: need a better way (maybe recursion) to parse chaos_config
# selector key is located in different depth when chaos config's kind is different
# for now, there are two kinds of chaos config: xxChaos and Schedule(applied in pod kill chaos).
if chaos_config["kind"] == "Schedule":
for k, v in chaos_config["spec"].items():
if "Chaos" in k and "selector" in v.keys():
selector = v["selector"]
break
else:
selector = chaos_config["spec"]["selector"]
log.info(f"chaos target selector: {selector}")
cluster_nodes = check_cluster_nodes(chaos_config)
tests_yaml = constants.TESTS_CONFIG_LOCATION + 'testcases.yaml'
tests_config = cc.gen_experiment_config(tests_yaml)
test_collections = tests_config.get('Collections', None)
@@ -75,16 +87,8 @@ class TestChaosBase:
test_chaos = t.get('testcase', {}).get('chaos', {})
if test_chaos in chaos_yaml:
expects = t.get('testcase', {}).get('expectation', {}).get('cluster_1_node', {})
# get the nums of pods
namespace = selector["namespaces"][0]
labels_dict = selector["labelSelectors"]
labels_list = []
for k,v in labels_dict.items():
labels_list.append(k+"="+v)
labels_str = ",".join(labels_list)
pods = get_pod_list(namespace, labels_str)
# for the cluster_n_node
if len(pods) > 1:
if cluster_nodes > 1:
expects = t.get('testcase', {}).get('expectation', {}).get('cluster_n_node', {})
log.info(f"yaml.expects: {expects}")
self.expect_create = expects.get(Op.create.value, constants.SUCC)
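
The selector parsing and pod counting that used to live here is now centralized in check_cluster_nodes, and the expectation block is chosen from its result. A condensed Python sketch of that decision (pick_expectation is a hypothetical name, used only for illustration):

def pick_expectation(testcase_entry, cluster_nodes):
    # cluster_1_node expectations by default; cluster_n_node only when the
    # chaos target really has more than one pod to keep the service alive
    key = 'cluster_n_node' if cluster_nodes > 1 else 'cluster_1_node'
    return testcase_entry.get('testcase', {}).get('expectation', {}).get(key, {})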


@@ -13,8 +13,8 @@ class CustomResourceOperations(object):
self.group = group
self.version = version
self.namespace = namespace
if kind.lower() == "schedule":
self.plural = "schedules"
if kind.lower()[-1] != "s":
self.plural = kind.lower() + "s"
else:
self.plural = kind.lower()
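
With this change the CRD plural is derived generically instead of special-casing Schedule: append an "s" unless the lowercased kind already ends in one. A quick sketch (illustrative only) of what that yields for a few Chaos Mesh kinds:

for kind in ["Schedule", "Workflow", "PodChaos", "NetworkChaos"]:
    plural = kind.lower() if kind.lower().endswith("s") else kind.lower() + "s"
    print(kind, "->", plural)
# Schedule -> schedules, Workflow -> workflows,
# PodChaos -> podchaos, NetworkChaos -> networkchaos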


@@ -0,0 +1,30 @@
def findkeys(node, kv):
# refer to https://stackoverflow.com/questions/9807634/find-all-occurrences-of-a-key-in-nested-dictionaries-and-lists
if isinstance(node, list):
for i in node:
for x in findkeys(i, kv):
yield x
elif isinstance(node, dict):
if kv in node:
yield node[kv]
for j in node.values():
for x in findkeys(j, kv):
yield x
if __name__ == "__main__":
d = { "id" : "abcde",
"key1" : "blah",
"key2" : "blah blah",
"nestedlist" : [
{ "id" : "qwerty",
"nestednestedlist" : [
{ "id" : "xyz", "keyA" : "blah blah blah" },
{ "id" : "fghi", "keyZ" : "blah blah blah" }],
"anothernestednestedlist" : [
{ "id" : "asdf", "keyQ" : "blah blah" },
{ "id" : "yuiop", "keyW" : "blah" }] } ] }
print(list(findkeys(d, 'id')))
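
As a second usage example (values are hypothetical), the same generator is what lets check_cluster_nodes pull the selector out of a Schedule-style pod-kill config without knowing how deeply it is nested:

chaos = {"kind": "Schedule",
         "spec": {"podChaos": {"selector": {"namespaces": ["chaos-testing"],
                                            "labelSelectors": {"release": "milvus-chaos"}}}}}
print(list(findkeys(chaos, "selector")))
# [{'namespaces': ['chaos-testing'], 'labelSelectors': {'release': 'milvus-chaos'}}]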