New Module: gcp_tpu_node (#57729)

This commit is contained in:
The Magician 2019-06-13 15:25:01 -07:00 committed by ansibot
parent 61ff61cdf6
commit 625ef22dd9
5 changed files with 593 additions and 0 deletions

View file

@ -0,0 +1,469 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2017 Google
# GNU General Public License v3.0+ (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt)
# ----------------------------------------------------------------------------
#
# *** AUTO GENERATED CODE *** AUTO GENERATED CODE ***
#
# ----------------------------------------------------------------------------
#
# This file is automatically generated by Magic Modules and manual
# changes will be clobbered when the file is regenerated.
#
# Please read more about how to change this file at
# https://www.github.com/GoogleCloudPlatform/magic-modules
#
# ----------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function
__metaclass__ = type
################################################################################
# Documentation
################################################################################
ANSIBLE_METADATA = {'metadata_version': '1.1', 'status': ["preview"], 'supported_by': 'community'}
DOCUMENTATION = '''
---
module: gcp_tpu_node
description:
- A Cloud TPU instance.
short_description: Creates a GCP Node
version_added: 2.9
author: Google Inc. (@googlecloudplatform)
requirements:
- python >= 2.6
- requests >= 2.18.4
- google-auth >= 1.3.0
options:
state:
description:
- Whether the given object should exist in GCP
choices:
- present
- absent
default: present
name:
description:
- The immutable name of the TPU.
required: true
description:
description:
- The user-supplied description of the TPU. Maximum of 512 characters.
required: false
accelerator_type:
description:
- The type of hardware accelerators associated with this node.
required: true
tensorflow_version:
description:
- The version of Tensorflow running in the Node.
required: true
network:
description:
- The name of a network to peer the TPU node to. It must be a preexisting Compute
Engine network inside of the project on which this API has been activated. If
none is provided, "default" will be used.
required: false
cidr_block:
description:
- The CIDR block that the TPU node will use when selecting an IP address. This
CIDR block must be a /29 block; the Compute Engine networks API forbids a smaller
block, and using a larger block would be wasteful (a node can only consume one
IP address).
- Errors will occur if the CIDR block has already been used for a currently existing
TPU node, the CIDR block conflicts with any subnetworks in the user's provided
network, or the provided network is peered with another network that is using
that CIDR block.
required: true
scheduling_config:
description:
- Sets the scheduling options for this TPU instance.
required: false
suboptions:
preemptible:
description:
- Defines whether the TPU instance is preemptible.
required: false
default: 'false'
type: bool
labels:
description:
- Resource labels to represent user provided metadata.
required: false
zone:
description:
- The GCP location for the TPU.
required: true
extends_documentation_fragment: gcp
notes:
- 'API Reference: U(https://cloud.google.com/tpu/docs/reference/rest/)'
- 'Official Documentation: U(https://cloud.google.com/tpu/docs/)'
'''
EXAMPLES = '''
- name: create a node
gcp_tpu_node:
name: test_object
zone: us-central1-b
accelerator_type: v3-8
tensorflow_version: '1.11'
cidr_block: 10.2.0.0/29
project: test_project
auth_kind: serviceaccount
service_account_file: "/tmp/auth.pem"
state: present
'''
RETURN = '''
name:
description:
- The immutable name of the TPU.
returned: success
type: str
description:
description:
- The user-supplied description of the TPU. Maximum of 512 characters.
returned: success
type: str
acceleratorType:
description:
- The type of hardware accelerators associated with this node.
returned: success
type: str
tensorflowVersion:
description:
- The version of Tensorflow running in the Node.
returned: success
type: str
network:
description:
- The name of a network to peer the TPU node to. It must be a preexisting Compute
Engine network inside of the project on which this API has been activated. If
none is provided, "default" will be used.
returned: success
type: str
cidrBlock:
description:
- The CIDR block that the TPU node will use when selecting an IP address. This CIDR
block must be a /29 block; the Compute Engine networks API forbids a smaller block,
and using a larger block would be wasteful (a node can only consume one IP address).
- Errors will occur if the CIDR block has already been used for a currently existing
TPU node, the CIDR block conflicts with any subnetworks in the user's provided
network, or the provided network is peered with another network that is using
that CIDR block.
returned: success
type: str
serviceAccount:
description:
- The service account used to run the tensor flow services within the node. To share
resources, including Google Cloud Storage data, with the Tensorflow job running
in the Node, this account must have permissions to that data.
returned: success
type: str
schedulingConfig:
description:
- Sets the scheduling options for this TPU instance.
returned: success
type: complex
contains:
preemptible:
description:
- Defines whether the TPU instance is preemptible.
returned: success
type: bool
networkEndpoints:
description:
- The network endpoints where TPU workers can be accessed and sent work.
- It is recommended that Tensorflow clients of the node first reach out to the first
(index 0) entry.
returned: success
type: complex
contains:
ipAddress:
description:
- The IP address of this network endpoint.
returned: success
type: str
port:
description:
- The port of this network endpoint.
returned: success
type: int
labels:
description:
- Resource labels to represent user provided metadata.
returned: success
type: dict
zone:
description:
- The GCP location for the TPU.
returned: success
type: str
'''
################################################################################
# Imports
################################################################################
from ansible.module_utils.gcp_utils import navigate_hash, GcpSession, GcpModule, GcpRequest, remove_nones_from_dict, replace_resource_dict
import json
import time
################################################################################
# Main
################################################################################
def main():
"""Main function"""
module = GcpModule(
argument_spec=dict(
state=dict(default='present', choices=['present', 'absent'], type='str'),
name=dict(required=True, type='str'),
description=dict(type='str'),
accelerator_type=dict(required=True, type='str'),
tensorflow_version=dict(required=True, type='str'),
network=dict(type='str'),
cidr_block=dict(required=True, type='str'),
scheduling_config=dict(type='dict', options=dict(preemptible=dict(type='bool'))),
labels=dict(type='dict'),
zone=dict(required=True, type='str'),
)
)
if not module.params['scopes']:
module.params['scopes'] = ['https://www.googleapis.com/auth/cloud-platform']
state = module.params['state']
fetch = fetch_resource(module, self_link(module))
changed = False
if fetch:
if state == 'present':
if is_different(module, fetch):
update(module, self_link(module), fetch)
fetch = fetch_resource(module, self_link(module))
changed = True
else:
delete(module, self_link(module))
fetch = {}
changed = True
else:
if state == 'present':
fetch = create(module, create_link(module))
changed = True
else:
fetch = {}
fetch.update({'changed': changed})
module.exit_json(**fetch)
def create(module, link):
auth = GcpSession(module, 'tpu')
return wait_for_operation(module, auth.post(link, resource_to_request(module)))
def update(module, link, fetch):
update_fields(module, resource_to_request(module), response_to_hash(module, fetch))
return fetch_resource(module, self_link(module))
def update_fields(module, request, response):
if response.get('tensorflowVersion') != request.get('tensorflowVersion'):
tensorflow_version_update(module, request, response)
def tensorflow_version_update(module, request, response):
auth = GcpSession(module, 'tpu')
auth.post(
''.join(["https://tpu.googleapis.com/v1/", "projects/{project}/locations/{zone}/nodes/{name}:reimage"]).format(**module.params),
{u'tensorflowVersion': module.params.get('tensorflow_version')},
)
def delete(module, link):
auth = GcpSession(module, 'tpu')
return wait_for_operation(module, auth.delete(link))
def resource_to_request(module):
request = {
u'name': module.params.get('name'),
u'description': module.params.get('description'),
u'acceleratorType': module.params.get('accelerator_type'),
u'tensorflowVersion': module.params.get('tensorflow_version'),
u'network': module.params.get('network'),
u'cidrBlock': module.params.get('cidr_block'),
u'schedulingConfig': NodeSchedulingconfig(module.params.get('scheduling_config', {}), module).to_request(),
u'labels': module.params.get('labels'),
}
return_vals = {}
for k, v in request.items():
if v or v is False:
return_vals[k] = v
return return_vals
def fetch_resource(module, link, allow_not_found=True):
auth = GcpSession(module, 'tpu')
return return_if_object(module, auth.get(link), allow_not_found)
def self_link(module):
return "https://tpu.googleapis.com/v1/projects/{project}/locations/{zone}/nodes/{name}".format(**module.params)
def collection(module):
return "https://tpu.googleapis.com/v1/projects/{project}/locations/{zone}/nodes".format(**module.params)
def create_link(module):
return "https://tpu.googleapis.com/v1/projects/{project}/locations/{zone}/nodes?nodeId={name}".format(**module.params)
def return_if_object(module, response, allow_not_found=False):
# If not found, return nothing.
if allow_not_found and response.status_code == 404:
return None
# If no content, return nothing.
if response.status_code == 204:
return None
try:
module.raise_for_status(response)
result = response.json()
except getattr(json.decoder, 'JSONDecodeError', ValueError):
module.fail_json(msg="Invalid JSON response with error: %s" % response.text)
if navigate_hash(result, ['error', 'errors']):
module.fail_json(msg=navigate_hash(result, ['error', 'errors']))
return result
def is_different(module, response):
request = resource_to_request(module)
response = response_to_hash(module, response)
# Remove all output-only from response.
response_vals = {}
for k, v in response.items():
if k in request:
response_vals[k] = v
request_vals = {}
for k, v in request.items():
if k in response:
request_vals[k] = v
return GcpRequest(request_vals) != GcpRequest(response_vals)
# Remove unnecessary properties from the response.
# This is for doing comparisons with Ansible's current parameters.
def response_to_hash(module, response):
return {
u'name': module.params.get('name'),
u'description': module.params.get('description'),
u'acceleratorType': module.params.get('accelerator_type'),
u'tensorflowVersion': response.get(u'tensorflowVersion'),
u'network': module.params.get('network'),
u'cidrBlock': module.params.get('cidr_block'),
u'serviceAccount': response.get(u'serviceAccount'),
u'schedulingConfig': NodeSchedulingconfig(module.params.get('scheduling_config', {}), module).to_request(),
u'networkEndpoints': NodeNetworkendpointsArray(response.get(u'networkEndpoints', []), module).from_response(),
u'labels': module.params.get('labels'),
}
def async_op_url(module, extra_data=None):
if extra_data is None:
extra_data = {}
url = "https://tpu.googleapis.com/v1/{op_id}"
combined = extra_data.copy()
combined.update(module.params)
return url.format(**combined)
def wait_for_operation(module, response):
op_result = return_if_object(module, response)
if op_result is None:
return {}
status = navigate_hash(op_result, ['done'])
wait_done = wait_for_completion(status, op_result, module)
raise_if_errors(op_result, ['error'], module)
return navigate_hash(wait_done, ['response'])
def wait_for_completion(status, op_result, module):
op_id = navigate_hash(op_result, ['name'])
op_uri = async_op_url(module, {'op_id': op_id})
while not status:
raise_if_errors(op_result, ['error'], module)
time.sleep(1.0)
op_result = fetch_resource(module, op_uri, False)
status = navigate_hash(op_result, ['done'])
return op_result
def raise_if_errors(response, err_path, module):
errors = navigate_hash(response, err_path)
if errors is not None:
module.fail_json(msg=errors)
class NodeSchedulingconfig(object):
def __init__(self, request, module):
self.module = module
if request:
self.request = request
else:
self.request = {}
def to_request(self):
return remove_nones_from_dict({u'preemptible': self.request.get('preemptible')})
def from_response(self):
return remove_nones_from_dict({u'preemptible': self.request.get(u'preemptible')})
class NodeNetworkendpointsArray(object):
def __init__(self, request, module):
self.module = module
if request:
self.request = request
else:
self.request = []
def to_request(self):
items = []
for item in self.request:
items.append(self._request_for_item(item))
return items
def from_response(self):
items = []
for item in self.request:
items.append(self._response_from_item(item))
return items
def _request_for_item(self, item):
return remove_nones_from_dict({})
def _response_from_item(self, item):
return remove_nones_from_dict({})
if __name__ == '__main__':
main()

View file

@ -0,0 +1,2 @@
cloud/gcp
unsupported

View file

@ -0,0 +1,2 @@
---
resource_name: "{{ resource_prefix }}"

View file

@ -0,0 +1,120 @@
---
# ----------------------------------------------------------------------------
#
# *** AUTO GENERATED CODE *** AUTO GENERATED CODE ***
#
# ----------------------------------------------------------------------------
#
# This file is automatically generated by Magic Modules and manual
# changes will be clobbered when the file is regenerated.
#
# Please read more about how to change this file at
# https://www.github.com/GoogleCloudPlatform/magic-modules
#
# ----------------------------------------------------------------------------
# Pre-test setup
- name: delete a node
gcp_tpu_node:
name: "{{ resource_name }}"
zone: us-central1-b
accelerator_type: v3-8
tensorflow_version: '1.11'
cidr_block: 10.2.0.0/29
project: "{{ gcp_project }}"
auth_kind: "{{ gcp_cred_kind }}"
service_account_file: "{{ gcp_cred_file }}"
state: absent
#----------------------------------------------------------
- name: create a node
gcp_tpu_node:
name: "{{ resource_name }}"
zone: us-central1-b
accelerator_type: v3-8
tensorflow_version: '1.11'
cidr_block: 10.2.0.0/29
project: "{{ gcp_project }}"
auth_kind: "{{ gcp_cred_kind }}"
service_account_file: "{{ gcp_cred_file }}"
state: present
register: result
- name: assert changed is true
assert:
that:
- result.changed == true
- name: verify that node was created
gcp_tpu_node_facts:
zone: us-central1-b
project: "{{ gcp_project }}"
auth_kind: "{{ gcp_cred_kind }}"
service_account_file: "{{ gcp_cred_file }}"
scopes:
- https://www.googleapis.com/auth/cloud-platform
register: results
- name: verify that command succeeded
assert:
that:
- results['resources'] | length >= 1
# ----------------------------------------------------------------------------
- name: create a node that already exists
gcp_tpu_node:
name: "{{ resource_name }}"
zone: us-central1-b
accelerator_type: v3-8
tensorflow_version: '1.11'
cidr_block: 10.2.0.0/29
project: "{{ gcp_project }}"
auth_kind: "{{ gcp_cred_kind }}"
service_account_file: "{{ gcp_cred_file }}"
state: present
register: result
- name: assert changed is false
assert:
that:
- result.changed == false
#----------------------------------------------------------
- name: delete a node
gcp_tpu_node:
name: "{{ resource_name }}"
zone: us-central1-b
accelerator_type: v3-8
tensorflow_version: '1.11'
cidr_block: 10.2.0.0/29
project: "{{ gcp_project }}"
auth_kind: "{{ gcp_cred_kind }}"
service_account_file: "{{ gcp_cred_file }}"
state: absent
register: result
- name: assert changed is true
assert:
that:
- result.changed == true
- name: verify that node was deleted
gcp_tpu_node_facts:
zone: us-central1-b
project: "{{ gcp_project }}"
auth_kind: "{{ gcp_cred_kind }}"
service_account_file: "{{ gcp_cred_file }}"
scopes:
- https://www.googleapis.com/auth/cloud-platform
register: results
- name: verify that command succeeded
assert:
that:
- results['resources'] | length == 0
# ----------------------------------------------------------------------------
- name: delete a node that does not exist
gcp_tpu_node:
name: "{{ resource_name }}"
zone: us-central1-b
accelerator_type: v3-8
tensorflow_version: '1.11'
cidr_block: 10.2.0.0/29
project: "{{ gcp_project }}"
auth_kind: "{{ gcp_cred_kind }}"
service_account_file: "{{ gcp_cred_file }}"
state: absent
register: result
- name: assert changed is false
assert:
that:
- result.changed == false