New Module: gcp_tpu_node (#57729)
This commit is contained in:
parent
61ff61cdf6
commit
625ef22dd9
5 changed files with 593 additions and 0 deletions
469
lib/ansible/modules/cloud/google/gcp_tpu_node.py
Normal file
469
lib/ansible/modules/cloud/google/gcp_tpu_node.py
Normal file
|
@ -0,0 +1,469 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright (C) 2017 Google
|
||||
# GNU General Public License v3.0+ (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt)
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
# *** AUTO GENERATED CODE *** AUTO GENERATED CODE ***
|
||||
#
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
# This file is automatically generated by Magic Modules and manual
|
||||
# changes will be clobbered when the file is regenerated.
|
||||
#
|
||||
# Please read more about how to change this file at
|
||||
# https://www.github.com/GoogleCloudPlatform/magic-modules
|
||||
#
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
__metaclass__ = type
|
||||
|
||||
################################################################################
|
||||
# Documentation
|
||||
################################################################################
|
||||
|
||||
ANSIBLE_METADATA = {'metadata_version': '1.1', 'status': ["preview"], 'supported_by': 'community'}
|
||||
|
||||
DOCUMENTATION = '''
|
||||
---
|
||||
module: gcp_tpu_node
|
||||
description:
|
||||
- A Cloud TPU instance.
|
||||
short_description: Creates a GCP Node
|
||||
version_added: 2.9
|
||||
author: Google Inc. (@googlecloudplatform)
|
||||
requirements:
|
||||
- python >= 2.6
|
||||
- requests >= 2.18.4
|
||||
- google-auth >= 1.3.0
|
||||
options:
|
||||
state:
|
||||
description:
|
||||
- Whether the given object should exist in GCP
|
||||
choices:
|
||||
- present
|
||||
- absent
|
||||
default: present
|
||||
name:
|
||||
description:
|
||||
- The immutable name of the TPU.
|
||||
required: true
|
||||
description:
|
||||
description:
|
||||
- The user-supplied description of the TPU. Maximum of 512 characters.
|
||||
required: false
|
||||
accelerator_type:
|
||||
description:
|
||||
- The type of hardware accelerators associated with this node.
|
||||
required: true
|
||||
tensorflow_version:
|
||||
description:
|
||||
- The version of Tensorflow running in the Node.
|
||||
required: true
|
||||
network:
|
||||
description:
|
||||
- The name of a network to peer the TPU node to. It must be a preexisting Compute
|
||||
Engine network inside of the project on which this API has been activated. If
|
||||
none is provided, "default" will be used.
|
||||
required: false
|
||||
cidr_block:
|
||||
description:
|
||||
- The CIDR block that the TPU node will use when selecting an IP address. This
|
||||
CIDR block must be a /29 block; the Compute Engine networks API forbids a smaller
|
||||
block, and using a larger block would be wasteful (a node can only consume one
|
||||
IP address).
|
||||
- Errors will occur if the CIDR block has already been used for a currently existing
|
||||
TPU node, the CIDR block conflicts with any subnetworks in the user's provided
|
||||
network, or the provided network is peered with another network that is using
|
||||
that CIDR block.
|
||||
required: true
|
||||
scheduling_config:
|
||||
description:
|
||||
- Sets the scheduling options for this TPU instance.
|
||||
required: false
|
||||
suboptions:
|
||||
preemptible:
|
||||
description:
|
||||
- Defines whether the TPU instance is preemptible.
|
||||
required: false
|
||||
default: 'false'
|
||||
type: bool
|
||||
labels:
|
||||
description:
|
||||
- Resource labels to represent user provided metadata.
|
||||
required: false
|
||||
zone:
|
||||
description:
|
||||
- The GCP location for the TPU.
|
||||
required: true
|
||||
extends_documentation_fragment: gcp
|
||||
notes:
|
||||
- 'API Reference: U(https://cloud.google.com/tpu/docs/reference/rest/)'
|
||||
- 'Official Documentation: U(https://cloud.google.com/tpu/docs/)'
|
||||
'''
|
||||
|
||||
EXAMPLES = '''
|
||||
- name: create a node
|
||||
gcp_tpu_node:
|
||||
name: test_object
|
||||
zone: us-central1-b
|
||||
accelerator_type: v3-8
|
||||
tensorflow_version: '1.11'
|
||||
cidr_block: 10.2.0.0/29
|
||||
project: test_project
|
||||
auth_kind: serviceaccount
|
||||
service_account_file: "/tmp/auth.pem"
|
||||
state: present
|
||||
'''
|
||||
|
||||
RETURN = '''
|
||||
name:
|
||||
description:
|
||||
- The immutable name of the TPU.
|
||||
returned: success
|
||||
type: str
|
||||
description:
|
||||
description:
|
||||
- The user-supplied description of the TPU. Maximum of 512 characters.
|
||||
returned: success
|
||||
type: str
|
||||
acceleratorType:
|
||||
description:
|
||||
- The type of hardware accelerators associated with this node.
|
||||
returned: success
|
||||
type: str
|
||||
tensorflowVersion:
|
||||
description:
|
||||
- The version of Tensorflow running in the Node.
|
||||
returned: success
|
||||
type: str
|
||||
network:
|
||||
description:
|
||||
- The name of a network to peer the TPU node to. It must be a preexisting Compute
|
||||
Engine network inside of the project on which this API has been activated. If
|
||||
none is provided, "default" will be used.
|
||||
returned: success
|
||||
type: str
|
||||
cidrBlock:
|
||||
description:
|
||||
- The CIDR block that the TPU node will use when selecting an IP address. This CIDR
|
||||
block must be a /29 block; the Compute Engine networks API forbids a smaller block,
|
||||
and using a larger block would be wasteful (a node can only consume one IP address).
|
||||
- Errors will occur if the CIDR block has already been used for a currently existing
|
||||
TPU node, the CIDR block conflicts with any subnetworks in the user's provided
|
||||
network, or the provided network is peered with another network that is using
|
||||
that CIDR block.
|
||||
returned: success
|
||||
type: str
|
||||
serviceAccount:
|
||||
description:
|
||||
- The service account used to run the tensor flow services within the node. To share
|
||||
resources, including Google Cloud Storage data, with the Tensorflow job running
|
||||
in the Node, this account must have permissions to that data.
|
||||
returned: success
|
||||
type: str
|
||||
schedulingConfig:
|
||||
description:
|
||||
- Sets the scheduling options for this TPU instance.
|
||||
returned: success
|
||||
type: complex
|
||||
contains:
|
||||
preemptible:
|
||||
description:
|
||||
- Defines whether the TPU instance is preemptible.
|
||||
returned: success
|
||||
type: bool
|
||||
networkEndpoints:
|
||||
description:
|
||||
- The network endpoints where TPU workers can be accessed and sent work.
|
||||
- It is recommended that Tensorflow clients of the node first reach out to the first
|
||||
(index 0) entry.
|
||||
returned: success
|
||||
type: complex
|
||||
contains:
|
||||
ipAddress:
|
||||
description:
|
||||
- The IP address of this network endpoint.
|
||||
returned: success
|
||||
type: str
|
||||
port:
|
||||
description:
|
||||
- The port of this network endpoint.
|
||||
returned: success
|
||||
type: int
|
||||
labels:
|
||||
description:
|
||||
- Resource labels to represent user provided metadata.
|
||||
returned: success
|
||||
type: dict
|
||||
zone:
|
||||
description:
|
||||
- The GCP location for the TPU.
|
||||
returned: success
|
||||
type: str
|
||||
'''
|
||||
|
||||
################################################################################
|
||||
# Imports
|
||||
################################################################################
|
||||
|
||||
from ansible.module_utils.gcp_utils import navigate_hash, GcpSession, GcpModule, GcpRequest, remove_nones_from_dict, replace_resource_dict
|
||||
import json
|
||||
import time
|
||||
|
||||
################################################################################
|
||||
# Main
|
||||
################################################################################
|
||||
|
||||
|
||||
def main():
    """Module entry point: reconcile the TPU node with the requested ``state``.

    Builds the ``GcpModule`` argument spec, fetches the node at its self link
    and then updates, deletes, creates or leaves it untouched. The resulting
    resource dict, extended with a ``changed`` flag, is passed to
    ``module.exit_json``.
    """
    spec = dict(
        state=dict(default='present', choices=['present', 'absent'], type='str'),
        name=dict(required=True, type='str'),
        description=dict(type='str'),
        accelerator_type=dict(required=True, type='str'),
        tensorflow_version=dict(required=True, type='str'),
        network=dict(type='str'),
        cidr_block=dict(required=True, type='str'),
        scheduling_config=dict(type='dict', options=dict(preemptible=dict(type='bool'))),
        labels=dict(type='dict'),
        zone=dict(required=True, type='str'),
    )
    module = GcpModule(argument_spec=spec)

    # Fall back to the cloud-platform scope when the caller supplied none.
    if not module.params['scopes']:
        module.params['scopes'] = ['https://www.googleapis.com/auth/cloud-platform']

    wants_present = module.params['state'] == 'present'

    fetch = fetch_resource(module, self_link(module))
    changed = False

    if fetch and wants_present:
        # Node exists and should exist: only act when it differs.
        if is_different(module, fetch):
            update(module, self_link(module), fetch)
            fetch = fetch_resource(module, self_link(module))
            changed = True
    elif fetch:
        # Node exists but should not.
        delete(module, self_link(module))
        fetch = {}
        changed = True
    elif wants_present:
        # Node is missing but should exist.
        fetch = create(module, create_link(module))
        changed = True
    else:
        # Node is missing and should stay that way.
        fetch = {}

    fetch.update({'changed': changed})

    module.exit_json(**fetch)
|
||||
|
||||
|
||||
def create(module, link):
    """POST the request body built from the module parameters to ``link``.

    Returns whatever ``wait_for_operation`` yields for the POST response.
    """
    session = GcpSession(module, 'tpu')
    response = session.post(link, resource_to_request(module))
    return wait_for_operation(module, response)
|
||||
|
||||
|
||||
def update(module, link, fetch):
    """Apply field-level updates, then return the freshly fetched resource.

    ``fetch`` is the current API response for the node. ``link`` is accepted
    but not used; the resource is re-read from ``self_link(module)``.
    """
    desired = resource_to_request(module)
    current = response_to_hash(module, fetch)
    update_fields(module, desired, current)
    return fetch_resource(module, self_link(module))
|
||||
|
||||
|
||||
def update_fields(module, request, response):
    """Trigger the per-field update calls for values that differ.

    Only ``tensorflowVersion`` is compared; when the value in ``response``
    differs from the one in ``request``, ``tensorflow_version_update`` is
    called.
    """
    current_version = response.get('tensorflowVersion')
    desired_version = request.get('tensorflowVersion')
    if current_version != desired_version:
        tensorflow_version_update(module, request, response)
|
||||
|
||||
|
||||
def tensorflow_version_update(module, request, response):
    """POST the configured ``tensorflow_version`` to the node's ``:reimage`` URL.

    ``request`` and ``response`` are accepted but not used; the URL and the
    payload are both derived from ``module.params``. The POST response is not
    inspected.
    """
    base_url = "https://tpu.googleapis.com/v1/"
    path = "projects/{project}/locations/{zone}/nodes/{name}:reimage"
    reimage_url = (base_url + path).format(**module.params)
    payload = {u'tensorflowVersion': module.params.get('tensorflow_version')}

    session = GcpSession(module, 'tpu')
    session.post(reimage_url, payload)
|
||||
|
||||
|
||||
def delete(module, link):
    """Send a DELETE for ``link`` and return what ``wait_for_operation`` yields."""
    session = GcpSession(module, 'tpu')
    response = session.delete(link)
    return wait_for_operation(module, response)
|
||||
|
||||
|
||||
def resource_to_request(module):
    """Build the API request body from the module parameters.

    Entries whose value is falsy are left out, with one exception: a literal
    ``False`` is kept.
    """
    params = module.params
    candidates = {
        u'name': params.get('name'),
        u'description': params.get('description'),
        u'acceleratorType': params.get('accelerator_type'),
        u'tensorflowVersion': params.get('tensorflow_version'),
        u'network': params.get('network'),
        u'cidrBlock': params.get('cidr_block'),
        u'schedulingConfig': NodeSchedulingconfig(params.get('scheduling_config', {}), module).to_request(),
        u'labels': params.get('labels'),
    }
    # dict(generator) rather than a dict comprehension, which Python 2.6 lacks.
    return dict((key, value) for key, value in candidates.items() if value or value is False)
|
||||
|
||||
|
||||
def fetch_resource(module, link, allow_not_found=True):
    """GET ``link`` and hand the response to ``return_if_object``.

    ``allow_not_found`` is forwarded unchanged to ``return_if_object``.
    """
    session = GcpSession(module, 'tpu')
    response = session.get(link)
    return return_if_object(module, response, allow_not_found)
|
||||
|
||||
|
||||
def self_link(module):
    """Return the URL of this node, filled in from ``module.params``."""
    template = "https://tpu.googleapis.com/v1/projects/{project}/locations/{zone}/nodes/{name}"
    return template.format(**module.params)
|
||||
|
||||
|
||||
def collection(module):
    """Return the URL of the nodes collection, filled in from ``module.params``."""
    template = "https://tpu.googleapis.com/v1/projects/{project}/locations/{zone}/nodes"
    return template.format(**module.params)
|
||||
|
||||
|
||||
def create_link(module):
    """Return the creation URL (collection URL plus ``nodeId`` query) from ``module.params``."""
    template = "https://tpu.googleapis.com/v1/projects/{project}/locations/{zone}/nodes?nodeId={name}"
    return template.format(**module.params)
|
||||
|
||||
|
||||
def return_if_object(module, response, allow_not_found=False):
    """Decode ``response`` as JSON, failing the module on errors.

    Returns ``None`` for a 204 response, and for a 404 response when
    ``allow_not_found`` is true. Otherwise the decoded JSON body is returned,
    unless it cannot be decoded or carries an ``error.errors`` entry, in which
    case ``module.fail_json`` is called.
    """
    not_found = allow_not_found and response.status_code == 404
    # Treat "allowed 404" and "no content" alike: there is no object to return.
    if not_found or response.status_code == 204:
        return None

    try:
        module.raise_for_status(response)
        payload = response.json()
    except getattr(json.decoder, 'JSONDecodeError', ValueError):
        # JSONDecodeError does not exist on every Python version; ValueError is the fallback.
        module.fail_json(msg="Invalid JSON response with error: %s" % response.text)

    if navigate_hash(payload, ['error', 'errors']):
        module.fail_json(msg=navigate_hash(payload, ['error', 'errors']))

    return payload
|
||||
|
||||
|
||||
def is_different(module, response):
    """Tell whether the requested configuration differs from the fetched resource.

    Both sides are reduced to the keys they have in common before being
    wrapped in ``GcpRequest`` and compared.
    """
    desired = resource_to_request(module)
    current = response_to_hash(module, response)

    # Keep only response keys that are also requested (drops output-only values).
    current_shared = dict((key, value) for key, value in current.items() if key in desired)
    # Likewise keep only requested keys that the response hash knows about.
    desired_shared = dict((key, value) for key, value in desired.items() if key in current)

    return GcpRequest(desired_shared) != GcpRequest(current_shared)
|
||||
|
||||
|
||||
def response_to_hash(module, response):
    """Reduce an API response to the properties compared with Ansible's parameters.

    Only ``tensorflowVersion``, ``serviceAccount`` and ``networkEndpoints``
    are read from ``response``; every other entry is taken from
    ``module.params``.
    """
    params = module.params
    return {
        u'name': params.get('name'),
        u'description': params.get('description'),
        u'acceleratorType': params.get('accelerator_type'),
        u'tensorflowVersion': response.get(u'tensorflowVersion'),
        u'network': params.get('network'),
        u'cidrBlock': params.get('cidr_block'),
        u'serviceAccount': response.get(u'serviceAccount'),
        u'schedulingConfig': NodeSchedulingconfig(params.get('scheduling_config', {}), module).to_request(),
        u'networkEndpoints': NodeNetworkendpointsArray(response.get(u'networkEndpoints', []), module).from_response(),
        u'labels': params.get('labels'),
    }
|
||||
|
||||
|
||||
def async_op_url(module, extra_data=None):
    """Return the polling URL for an operation.

    The ``{op_id}`` placeholder is filled from ``extra_data`` merged with
    ``module.params``; on a key clash the value from ``module.params`` wins.
    ``extra_data`` itself is not modified.
    """
    values = ({} if extra_data is None else extra_data).copy()
    values.update(module.params)
    template = "https://tpu.googleapis.com/v1/{op_id}"
    return template.format(**values)
|
||||
|
||||
|
||||
def wait_for_operation(module, response):
    """Wait for the operation described by ``response`` and return its result.

    Args:
        module: the module object, used for polling and for failure reporting.
        response: the HTTP response of the call that started the operation.

    Returns:
        ``{}`` when ``return_if_object`` yields no object for ``response``;
        otherwise the value stored under ``response`` in the finished
        operation.

    If the finished operation carries an ``error`` entry, the module fails
    via ``raise_if_errors``.
    """
    op_result = return_if_object(module, response)
    if op_result is None:
        return {}
    status = navigate_hash(op_result, ['done'])
    wait_done = wait_for_completion(status, op_result, module)
    # Inspect the FINAL operation, not the initial snapshot: the polling loop
    # returns as soon as a poll reports done, without checking that poll for
    # errors, so checking ``op_result`` here would let a failed operation
    # slip through and hand back ``None`` as its result.
    raise_if_errors(wait_done, ['error'], module)
    return navigate_hash(wait_done, ['response'])
|
||||
|
||||
|
||||
def wait_for_completion(status, op_result, module):
    """Poll the operation once per second until it reports ``done``.

    ``status`` is the ``done`` value of the initial ``op_result``. Before each
    poll, the most recent operation is passed to ``raise_if_errors``. Returns
    the last operation seen, which is ``op_result`` itself when ``status``
    was already truthy.
    """
    op_uri = async_op_url(module, {'op_id': navigate_hash(op_result, ['name'])})
    finished = status
    latest = op_result
    while not finished:
        raise_if_errors(latest, ['error'], module)
        time.sleep(1.0)
        # A missing operation is not acceptable here, hence allow_not_found=False.
        latest = fetch_resource(module, op_uri, False)
        finished = navigate_hash(latest, ['done'])
    return latest
|
||||
|
||||
|
||||
def raise_if_errors(response, err_path, module):
    """Fail the module when ``response`` holds a value at ``err_path``.

    The value found there is used as the failure message. Nothing happens
    when the lookup yields ``None``.
    """
    found = navigate_hash(response, err_path)
    if found is None:
        return
    module.fail_json(msg=found)
|
||||
|
||||
|
||||
class NodeSchedulingconfig(object):
    """Wrapper converting the scheduling-config dict to and from API form.

    A falsy ``request`` (``None``, empty dict) is stored as an empty dict.
    """

    def __init__(self, request, module):
        self.module = module
        self.request = request if request else {}

    def to_request(self):
        """Return the ``preemptible`` entry, passed through ``remove_nones_from_dict``."""
        preemptible = self.request.get('preemptible')
        return remove_nones_from_dict({u'preemptible': preemptible})

    def from_response(self):
        """Return the ``preemptible`` entry, passed through ``remove_nones_from_dict``."""
        preemptible = self.request.get(u'preemptible')
        return remove_nones_from_dict({u'preemptible': preemptible})
|
||||
|
||||
|
||||
class NodeNetworkendpointsArray(object):
    """Wrapper converting the network-endpoints list to and from API form.

    A falsy ``request`` (``None``, empty list) is stored as an empty list.
    Both item converters pass an empty dict through
    ``remove_nones_from_dict``, so no field of an item is carried over.
    """

    def __init__(self, request, module):
        self.module = module
        self.request = request if request else []

    def to_request(self):
        """Return one converted entry per stored item, in order."""
        return [self._request_for_item(entry) for entry in self.request]

    def from_response(self):
        """Return one converted entry per stored item, in order."""
        return [self._response_from_item(entry) for entry in self.request]

    def _request_for_item(self, item):
        """Convert one item for a request; ``item`` itself is not read."""
        return remove_nones_from_dict({})

    def _response_from_item(self, item):
        """Convert one item from a response; ``item`` itself is not read."""
        return remove_nones_from_dict({})
|
||||
|
||||
|
||||
# Run the module only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
2
test/integration/targets/gcp_tpu_node/aliases
Normal file
2
test/integration/targets/gcp_tpu_node/aliases
Normal file
|
@ -0,0 +1,2 @@
|
|||
cloud/gcp
|
||||
unsupported
|
2
test/integration/targets/gcp_tpu_node/defaults/main.yml
Normal file
2
test/integration/targets/gcp_tpu_node/defaults/main.yml
Normal file
|
@ -0,0 +1,2 @@
|
|||
---
|
||||
resource_name: "{{ resource_prefix }}"
|
0
test/integration/targets/gcp_tpu_node/meta/main.yml
Normal file
0
test/integration/targets/gcp_tpu_node/meta/main.yml
Normal file
120
test/integration/targets/gcp_tpu_node/tasks/main.yml
Normal file
120
test/integration/targets/gcp_tpu_node/tasks/main.yml
Normal file
|
@ -0,0 +1,120 @@
|
|||
---
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
# *** AUTO GENERATED CODE *** AUTO GENERATED CODE ***
|
||||
#
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
# This file is automatically generated by Magic Modules and manual
|
||||
# changes will be clobbered when the file is regenerated.
|
||||
#
|
||||
# Please read more about how to change this file at
|
||||
# https://www.github.com/GoogleCloudPlatform/magic-modules
|
||||
#
|
||||
# ----------------------------------------------------------------------------
|
||||
# Pre-test setup
|
||||
- name: delete a node
|
||||
gcp_tpu_node:
|
||||
name: "{{ resource_name }}"
|
||||
zone: us-central1-b
|
||||
accelerator_type: v3-8
|
||||
tensorflow_version: '1.11'
|
||||
cidr_block: 10.2.0.0/29
|
||||
project: "{{ gcp_project }}"
|
||||
auth_kind: "{{ gcp_cred_kind }}"
|
||||
service_account_file: "{{ gcp_cred_file }}"
|
||||
state: absent
|
||||
#----------------------------------------------------------
|
||||
- name: create a node
|
||||
gcp_tpu_node:
|
||||
name: "{{ resource_name }}"
|
||||
zone: us-central1-b
|
||||
accelerator_type: v3-8
|
||||
tensorflow_version: '1.11'
|
||||
cidr_block: 10.2.0.0/29
|
||||
project: "{{ gcp_project }}"
|
||||
auth_kind: "{{ gcp_cred_kind }}"
|
||||
service_account_file: "{{ gcp_cred_file }}"
|
||||
state: present
|
||||
register: result
|
||||
- name: assert changed is true
|
||||
assert:
|
||||
that:
|
||||
- result.changed == true
|
||||
- name: verify that node was created
|
||||
gcp_tpu_node_facts:
|
||||
zone: us-central1-b
|
||||
project: "{{ gcp_project }}"
|
||||
auth_kind: "{{ gcp_cred_kind }}"
|
||||
service_account_file: "{{ gcp_cred_file }}"
|
||||
scopes:
|
||||
- https://www.googleapis.com/auth/cloud-platform
|
||||
register: results
|
||||
- name: verify that command succeeded
|
||||
assert:
|
||||
that:
|
||||
- results['resources'] | length >= 1
|
||||
# ----------------------------------------------------------------------------
|
||||
- name: create a node that already exists
|
||||
gcp_tpu_node:
|
||||
name: "{{ resource_name }}"
|
||||
zone: us-central1-b
|
||||
accelerator_type: v3-8
|
||||
tensorflow_version: '1.11'
|
||||
cidr_block: 10.2.0.0/29
|
||||
project: "{{ gcp_project }}"
|
||||
auth_kind: "{{ gcp_cred_kind }}"
|
||||
service_account_file: "{{ gcp_cred_file }}"
|
||||
state: present
|
||||
register: result
|
||||
- name: assert changed is false
|
||||
assert:
|
||||
that:
|
||||
- result.changed == false
|
||||
#----------------------------------------------------------
|
||||
- name: delete a node
|
||||
gcp_tpu_node:
|
||||
name: "{{ resource_name }}"
|
||||
zone: us-central1-b
|
||||
accelerator_type: v3-8
|
||||
tensorflow_version: '1.11'
|
||||
cidr_block: 10.2.0.0/29
|
||||
project: "{{ gcp_project }}"
|
||||
auth_kind: "{{ gcp_cred_kind }}"
|
||||
service_account_file: "{{ gcp_cred_file }}"
|
||||
state: absent
|
||||
register: result
|
||||
- name: assert changed is true
|
||||
assert:
|
||||
that:
|
||||
- result.changed == true
|
||||
- name: verify that node was deleted
|
||||
gcp_tpu_node_facts:
|
||||
zone: us-central1-b
|
||||
project: "{{ gcp_project }}"
|
||||
auth_kind: "{{ gcp_cred_kind }}"
|
||||
service_account_file: "{{ gcp_cred_file }}"
|
||||
scopes:
|
||||
- https://www.googleapis.com/auth/cloud-platform
|
||||
register: results
|
||||
- name: verify that command succeeded
|
||||
assert:
|
||||
that:
|
||||
- results['resources'] | length == 0
|
||||
# ----------------------------------------------------------------------------
|
||||
- name: delete a node that does not exist
|
||||
gcp_tpu_node:
|
||||
name: "{{ resource_name }}"
|
||||
zone: us-central1-b
|
||||
accelerator_type: v3-8
|
||||
tensorflow_version: '1.11'
|
||||
cidr_block: 10.2.0.0/29
|
||||
project: "{{ gcp_project }}"
|
||||
auth_kind: "{{ gcp_cred_kind }}"
|
||||
service_account_file: "{{ gcp_cred_file }}"
|
||||
state: absent
|
||||
register: result
|
||||
- name: assert changed is false
|
||||
assert:
|
||||
that:
|
||||
- result.changed == false
|
Loading…
Reference in a new issue