gcpdiag.queries.dataproc
class Cluster(models.Resource):
  """Represents Dataproc Cluster"""

  name: str
  _resource_data: Mapping

  def __init__(self, name: str, project_id: str, resource_data: Mapping):
    super().__init__(project_id)
    self.name = name
    self._resource_data = resource_data

  def is_running(self) -> bool:
    return self.status == 'RUNNING'

  def get_software_property(self, property_name) -> str:
    return self._resource_data['config']['softwareConfig']['properties'].get(
        property_name)

  def is_stackdriver_logging_enabled(self) -> bool:
    # Unless overridden during create,
    # properties with default values are not returned,
    # therefore get_software_property should only return when its false
    return (not self.get_software_property(
        'dataproc:dataproc.logging.stackdriver.enable') == 'false')

  def is_stackdriver_monitoring_enabled(self) -> bool:
    return (self.get_software_property(
        'dataproc:dataproc.monitoring.stackdriver.enable') == 'true')

  @property
  def region(self) -> str:
    """biggest regions have a trailing '-d' at most in its zoneUri

    https://www.googleapis.com/compute/v1/projects/dataproc1/zones/us-central1-d
    """
    return self._resource_data['config']['gceClusterConfig']['zoneUri'].split(
        '/')[-1][0:-2]

  @property
  def zone(self) -> Optional[str]:
    zone = (self._resource_data.get('config', {}).get('gceClusterConfig',
                                                       {}).get('zoneUri'))
    if zone:
      m = re.search(r'/zones/([^/]+)$', zone)
      if m:
        return m.group(1)
    raise RuntimeError(f"can't determine zone for cluster {self.name}")

  @property
  def full_path(self) -> str:
    return (
        f'projects/{self.project_id}/regions/{self.region}/clusters/{self.name}'
    )

  @property
  def short_path(self) -> str:
    return f'{self.project_id}/{self.region}/{self.name}'

  @property
  def status(self) -> str:
    return self._resource_data['status']['state']

  def __str__(self) -> str:
    return self.short_path

  @property
  def cluster_uuid(self) -> str:
    return self._resource_data['clusterUuid']

  @property
  def image_version(self):
    return self._resource_data['config']['softwareConfig']['imageVersion']

  @property
  def vm_service_account_email(self):
    sa = self._resource_data['config']['gceClusterConfig'].get('serviceAccount')
    if sa is None:
      sa = crm.get_project(self.project_id).default_compute_service_account
    return sa

  @property
  def is_custom_gcs_connector(self) -> bool:
    return bool(
        self._resource_data.get('config', {}).get('gceClusterConfig', {}).get(
            'metadata', {}).get('GCS_CONNECTOR_VERSION'))

  @property
  def cluster_provided_bq_connector(self):
    """Check user-supplied BigQuery connector on the cluster level"""
    bigquery_connector = (self._resource_data.get('config', {}).get(
        'gceClusterConfig', {}).get('metadata',
                                    {}).get('SPARK_BQ_CONNECTOR_VERSION'))
    if not bigquery_connector:
      bigquery_connector = (self._resource_data.get('config', {}).get(
          'gceClusterConfig', {}).get('metadata',
                                      {}).get('SPARK_BQ_CONNECTOR_URL'))
    if bigquery_connector:
      if bigquery_connector == 'spark-bigquery-latest.jar':
        return 'spark-bigquery-latest'
      else:
        match = re.search(
            r'spark-bigquery(?:-with-dependencies_\d+\.\d+)?-(\d+\.\d+\.\d+)\.jar',
            bigquery_connector)
        if match:
          return match.group(1)
    # If returns None, it means that the cluster is using the default,
    # pre-installed BQ connector for the image version
    return bigquery_connector

  @property
  def is_gce_cluster(self) -> bool:
    return bool(self._resource_data.get('config', {}).get('gceClusterConfig'))

  @property
  def gce_network_uri(self) -> Optional[str]:
    """Get network uri from cluster network or subnetwork"""
    if not self.is_gce_cluster:
      raise RuntimeError(
          'Can not return network URI for a Dataproc on GKE cluster')
    network_uri = (self._resource_data.get('config',
                                           {}).get('gceClusterConfig',
                                                   {}).get('networkUri'))
    if not network_uri:
      subnetwork_uri = (self._resource_data.get('config', {}).get(
          'gceClusterConfig', {}).get('subnetworkUri'))
      network_uri = network.get_subnetwork_from_url(subnetwork_uri).network
    return network_uri

  @property
  def gce_subnetwork_uri(self) -> Optional[str]:
    """Get subnetwork uri from cluster subnetwork."""
    if not self.is_gce_cluster:
      raise RuntimeError(
          'Can not return subnetwork URI for a Dataproc on GKE cluster')
    subnetwork_uri = (self._resource_data.get('config',
                                              {}).get('gceClusterConfig',
                                                      {}).get('subnetworkUri'))
    if not subnetwork_uri:
      subnetwork_uri = ('https://www.googleapis.com/compute/v1/projects/' +
                        self.project_id + '/regions/' + self.region +
                        '/subnetworks/default')
    return subnetwork_uri

  @property
  def is_single_node_cluster(self) -> bool:
    workers = (self._resource_data.get('config',
                                       {}).get('workerConfig',
                                               {}).get('numInstances', 0))
    return workers == 0

  @property
  def is_ha_cluster(self) -> bool:
    masters = (self._resource_data.get('config',
                                       {}).get('masterConfig',
                                               {}).get('numInstances', 1))
    return masters != 1

  @property
  def is_internal_ip_only(self) -> bool:
    # internalIpOnly is set to true by default when creating a
    # Dataproc 2.2 image version cluster.
    # The default should be false in older versions instead.
    internal_ip_only = self._resource_data['config']['gceClusterConfig'][
        'internalIpOnly']
    return internal_ip_only

  @property
  def has_autoscaling_policy(self) -> bool:
    """Checks if an autoscaling policy is configured for the cluster."""
    return bool(self._resource_data['config'].get('autoscalingConfig', {}))

  @property
  def autoscaling_policy_id(self) -> str:
    """Returns the autoscaling policy ID for the cluster."""
    if self.has_autoscaling_policy:
      return (self._resource_data['config'].get('autoscalingConfig',
                                                {}).get('policyUri',
                                                        '').split('/')[-1])
    else:
      return ''

  @property
  def number_of_primary_workers(self) -> float:
    """Gets the number of primary worker nodes in the cluster."""
    return (self._resource_data['config'].get('workerConfig',
                                              {}).get('numInstances', 0))

  @property
  def number_of_secondary_workers(self) -> float:
    """Gets the number of secondary worker nodes in the cluster."""
    return (self._resource_data['config'].get('secondaryWorkerConfig',
                                              {}).get('numInstances', 0))

  @property
  def is_preemptible_primary_workers(self) -> bool:
    """Checks if the primary worker nodes in the cluster are preemptible."""
    return (self._resource_data['config'].get('workerConfig',
                                              {}).get('isPreemptible', False))

  @property
  def is_preemptible_secondary_workers(self) -> bool:
    """Checks if the secondary worker nodes in the cluster are preemptible."""
    return (self._resource_data['config'].get('secondaryWorkerConfig',
                                              {}).get('isPreemptible', False))

  @property
  def initialization_actions(self) -> List[str]:
    return self._resource_data['config'].get('initializationActions', [])
Represents Dataproc Cluster
  def is_stackdriver_logging_enabled(self) -> bool:
    # Unless overridden during create,
    # properties with default values are not returned,
    # therefore get_software_property should only return when its false
    return (not self.get_software_property(
        'dataproc:dataproc.logging.stackdriver.enable') == 'false')
  @property
  def region(self) -> str:
    """biggest regions have a trailing '-d' at most in its zoneUri

    https://www.googleapis.com/compute/v1/projects/dataproc1/zones/us-central1-d
    """
    return self._resource_data['config']['gceClusterConfig']['zoneUri'].split(
        '/')[-1][0:-2]
The zone in the zoneUri carries at most a short trailing suffix such as '-d', so the region is obtained by stripping the last two characters, e.g.:
https://www.googleapis.com/compute/v1/projects/dataproc1/zones/us-central1-d
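For illustration, a minimal sketch of the string manipulation this property performs on a zoneUri like the one above (the values are hypothetical):

zone_uri = 'https://www.googleapis.com/compute/v1/projects/dataproc1/zones/us-central1-d'
zone = zone_uri.split('/')[-1]  # 'us-central1-d'
region = zone[0:-2]             # 'us-central1'
print(region)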
  @property
  def zone(self) -> Optional[str]:
    zone = (self._resource_data.get('config', {}).get('gceClusterConfig',
                                                       {}).get('zoneUri'))
    if zone:
      m = re.search(r'/zones/([^/]+)$', zone)
      if m:
        return m.group(1)
    raise RuntimeError(f"can't determine zone for cluster {self.name}")
  @property
  def full_path(self) -> str:
    return (
        f'projects/{self.project_id}/regions/{self.region}/clusters/{self.name}'
    )
Returns the full path of this resource.
Example: 'projects/gcpdiag-gke-1-9b90/zones/europe-west4-a/clusters/gke1'
  @property
  def short_path(self) -> str:
    return f'{self.project_id}/{self.region}/{self.name}'
Returns the short name for this resource.
Note that it isn't clear from this name what kind of resource it is.
Example: 'gke1'
  @property
  def cluster_provided_bq_connector(self):
    """Check user-supplied BigQuery connector on the cluster level"""
    bigquery_connector = (self._resource_data.get('config', {}).get(
        'gceClusterConfig', {}).get('metadata',
                                    {}).get('SPARK_BQ_CONNECTOR_VERSION'))
    if not bigquery_connector:
      bigquery_connector = (self._resource_data.get('config', {}).get(
          'gceClusterConfig', {}).get('metadata',
                                      {}).get('SPARK_BQ_CONNECTOR_URL'))
    if bigquery_connector:
      if bigquery_connector == 'spark-bigquery-latest.jar':
        return 'spark-bigquery-latest'
      else:
        match = re.search(
            r'spark-bigquery(?:-with-dependencies_\d+\.\d+)?-(\d+\.\d+\.\d+)\.jar',
            bigquery_connector)
        if match:
          return match.group(1)
    # If returns None, it means that the cluster is using the default,
    # pre-installed BQ connector for the image version
    return bigquery_connector
Checks for a user-supplied BigQuery connector at the cluster level.
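As a rough illustration of the version extraction performed above, a self-contained sketch using the same regular expression and a hypothetical jar name (not taken from any real cluster):

import re

# Hypothetical SPARK_BQ_CONNECTOR_URL metadata value.
jar = 'gs://my-bucket/spark-bigquery-with-dependencies_2.12-0.27.0.jar'
match = re.search(
    r'spark-bigquery(?:-with-dependencies_\d+\.\d+)?-(\d+\.\d+\.\d+)\.jar', jar)
print(match.group(1) if match else None)  # prints '0.27.0'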
  @property
  def gce_network_uri(self) -> Optional[str]:
    """Get network uri from cluster network or subnetwork"""
    if not self.is_gce_cluster:
      raise RuntimeError(
          'Can not return network URI for a Dataproc on GKE cluster')
    network_uri = (self._resource_data.get('config',
                                           {}).get('gceClusterConfig',
                                                   {}).get('networkUri'))
    if not network_uri:
      subnetwork_uri = (self._resource_data.get('config', {}).get(
          'gceClusterConfig', {}).get('subnetworkUri'))
      network_uri = network.get_subnetwork_from_url(subnetwork_uri).network
    return network_uri
Gets the network URI from the cluster's network or subnetwork.
  @property
  def gce_subnetwork_uri(self) -> Optional[str]:
    """Get subnetwork uri from cluster subnetwork."""
    if not self.is_gce_cluster:
      raise RuntimeError(
          'Can not return subnetwork URI for a Dataproc on GKE cluster')
    subnetwork_uri = (self._resource_data.get('config',
                                              {}).get('gceClusterConfig',
                                                      {}).get('subnetworkUri'))
    if not subnetwork_uri:
      subnetwork_uri = ('https://www.googleapis.com/compute/v1/projects/' +
                        self.project_id + '/regions/' + self.region +
                        '/subnetworks/default')
    return subnetwork_uri
Gets the subnetwork URI from the cluster's subnetwork.
  @property
  def is_internal_ip_only(self) -> bool:
    # internalIpOnly is set to true by default when creating a
    # Dataproc 2.2 image version cluster.
    # The default should be false in older versions instead.
    internal_ip_only = self._resource_data['config']['gceClusterConfig'][
        'internalIpOnly']
    return internal_ip_only
  @property
  def has_autoscaling_policy(self) -> bool:
    """Checks if an autoscaling policy is configured for the cluster."""
    return bool(self._resource_data['config'].get('autoscalingConfig', {}))
Checks if an autoscaling policy is configured for the cluster.
  @property
  def autoscaling_policy_id(self) -> str:
    """Returns the autoscaling policy ID for the cluster."""
    if self.has_autoscaling_policy:
      return (self._resource_data['config'].get('autoscalingConfig',
                                                {}).get('policyUri',
                                                        '').split('/')[-1])
    else:
      return ''
Returns the autoscaling policy ID for the cluster.
  @property
  def number_of_primary_workers(self) -> float:
    """Gets the number of primary worker nodes in the cluster."""
    return (self._resource_data['config'].get('workerConfig',
                                              {}).get('numInstances', 0))
Gets the number of primary worker nodes in the cluster.
  @property
  def number_of_secondary_workers(self) -> float:
    """Gets the number of secondary worker nodes in the cluster."""
    return (self._resource_data['config'].get('secondaryWorkerConfig',
                                              {}).get('numInstances', 0))
Gets the number of secondary worker nodes in the cluster.
  @property
  def is_preemptible_primary_workers(self) -> bool:
    """Checks if the primary worker nodes in the cluster are preemptible."""
    return (self._resource_data['config'].get('workerConfig',
                                              {}).get('isPreemptible', False))
Checks if the primary worker nodes in the cluster are preemptible.
  @property
  def is_preemptible_secondary_workers(self) -> bool:
    """Checks if the secondary worker nodes in the cluster are preemptible."""
    return (self._resource_data['config'].get('secondaryWorkerConfig',
                                              {}).get('isPreemptible', False))
Checks if the secondary worker nodes in the cluster are preemptible.
class Region:
  """Represents Dataproc region"""

  project_id: str
  region: str

  def __init__(self, project_id: str, region: str):
    self.project_id = project_id
    self.region = region

  def get_clusters(self, context: models.Context) -> Iterable[Cluster]:
    clusters = []
    for cluster in self.query_api():
      if not context.match_project_resource(resource=cluster.get('clusterName'),
                                            labels=cluster.get('labels', {})):
        continue
      c = Cluster(
          name=cluster['clusterName'],
          project_id=self.project_id,
          resource_data=cluster,
      )
      clusters.append(c)
    return clusters

  def query_api(self) -> Iterable[dict]:
    try:
      api = apis.get_api('dataproc', 'v1', self.project_id)
      query = (api.projects().regions().clusters().list(
          projectId=self.project_id, region=self.region))
      # be careful not to retry too many times because querying all regions
      # sometimes causes requests to fail permanently
      resp = query.execute(num_retries=1)
      return resp.get('clusters', [])
    except googleapiclient.errors.HttpError as err:
      # b/371526148 investigate permission denied error
      logging.error(err)
      return []
      # raise utils.GcpApiError(err) from err
Represents Dataproc region
  def get_clusters(self, context: models.Context) -> Iterable[Cluster]:
    clusters = []
    for cluster in self.query_api():
      if not context.match_project_resource(resource=cluster.get('clusterName'),
                                            labels=cluster.get('labels', {})):
        continue
      c = Cluster(
          name=cluster['clusterName'],
          project_id=self.project_id,
          resource_data=cluster,
      )
      clusters.append(c)
    return clusters
  def query_api(self) -> Iterable[dict]:
    try:
      api = apis.get_api('dataproc', 'v1', self.project_id)
      query = (api.projects().regions().clusters().list(
          projectId=self.project_id, region=self.region))
      # be careful not to retry too many times because querying all regions
      # sometimes causes requests to fail permanently
      resp = query.execute(num_retries=1)
      return resp.get('clusters', [])
    except googleapiclient.errors.HttpError as err:
      # b/371526148 investigate permission denied error
      logging.error(err)
      return []
      # raise utils.GcpApiError(err) from err
class Dataproc:
  """Represents Dataproc product"""

  project_id: str

  def __init__(self, project_id: str):
    self.project_id = project_id

  def get_regions(self) -> Iterable[Region]:
    return [
        Region(self.project_id, r.name)
        for r in gce.get_all_regions(self.project_id)
    ]

  def is_api_enabled(self) -> bool:
    return apis.is_enabled(self.project_id, 'dataproc')
Represents Dataproc product
@caching.cached_api_call
def get_clusters(context: models.Context) -> Iterable[Cluster]:
  r: List[Cluster] = []
  dataproc = Dataproc(context.project_id)
  if not dataproc.is_api_enabled():
    return r
  executor = get_executor()
  for clusters in executor.map(lambda r: r.get_clusters(context),
                               dataproc.get_regions()):
    r += clusters
  return r
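A minimal usage sketch, assuming gcpdiag is importable and that models.Context can be built from a project id; the project name is hypothetical:

from gcpdiag import models
from gcpdiag.queries import dataproc

# Hypothetical project id; assumes models.Context(project_id=...) is accepted.
context = models.Context(project_id='example-project')
for cluster in dataproc.get_clusters(context):
  print(cluster.short_path, cluster.status)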
@caching.cached_api_call
def get_cluster(cluster_name, region, project) -> Optional[Cluster]:
  api = apis.get_api('dataproc', 'v1', project)
  request = api.projects().regions().clusters().get(projectId=project,
                                                    clusterName=cluster_name,
                                                    region=region)
  try:
    r = request.execute(num_retries=config.API_RETRIES)
  except (googleapiclient.errors.HttpError,
          requests.exceptions.RequestException):
    #logging.error(err)
    return None
  return Cluster(r['clusterName'], project_id=r['projectId'], resource_data=r)
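A minimal sketch of fetching a single cluster and reading a few properties; the project, region and cluster names are hypothetical, and the function returns None when the lookup fails:

from gcpdiag.queries import dataproc

# Hypothetical identifiers.
cluster = dataproc.get_cluster('example-cluster', 'us-central1', 'example-project')
if cluster is None:
  print('cluster not found or the API call failed')
else:
  print(cluster.full_path, cluster.status)
  if cluster.is_running() and not cluster.is_stackdriver_logging_enabled():
    print('cluster is running but Cloud Logging is disabled')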
class AutoScalingPolicy(models.Resource):
  """AutoScalingPolicy."""

  _resource_data: dict

  def __init__(self, project_id, resource_data, region):
    super().__init__(project_id=project_id)
    self._resource_data = resource_data
    self.region = region

  @property
  def policy_id(self) -> str:
    return self._resource_data['id']

  @property
  def full_path(self) -> str:
    return self._resource_data['name']

  @property
  def short_path(self) -> str:
    return f'{self.project_id}/{self.region}/{self.policy_id}'

  @property
  def name(self) -> str:
    return self._resource_data['name']

  @property
  def scale_down_factor(self) -> float:
    return self._resource_data['basicAlgorithm']['yarnConfig'].get(
        'scaleDownFactor', 0.0)

  @property
  def has_graceful_decommission_timeout(self) -> bool:
    """Checks if a graceful decommission timeout is configured in the autoscaling policy."""
    return bool(
        self._resource_data.get('basicAlgorithm',
                                {}).get('yarnConfig',
                                        {}).get('gracefulDecommissionTimeout',
                                                {}))

  @property
  def graceful_decommission_timeout(self) -> float:
    """Gets the configured graceful decommission timeout in the autoscaling policy."""
    return (self._resource_data.get('basicAlgorithm',
                                    {}).get('yarnConfig', {}).get(
                                        'gracefulDecommissionTimeout', -1))
AutoScalingPolicy.
Returns the full path of this resource.
Example: 'projects/gcpdiag-gke-1-9b90/zones/europe-west4-a/clusters/gke1'
  @property
  def short_path(self) -> str:
    return f'{self.project_id}/{self.region}/{self.policy_id}'
Returns the short name for this resource.
Note that it isn't clear from this name what kind of resource it is.
Example: 'gke1'
  @property
  def has_graceful_decommission_timeout(self) -> bool:
    """Checks if a graceful decommission timeout is configured in the autoscaling policy."""
    return bool(
        self._resource_data.get('basicAlgorithm',
                                {}).get('yarnConfig',
                                        {}).get('gracefulDecommissionTimeout',
                                                {}))
Checks if a graceful decommission timeout is configured in the autoscaling policy.
  @property
  def graceful_decommission_timeout(self) -> float:
    """Gets the configured graceful decommission timeout in the autoscaling policy."""
    return (self._resource_data.get('basicAlgorithm',
                                    {}).get('yarnConfig', {}).get(
                                        'gracefulDecommissionTimeout', -1))
Gets the configured graceful decommission timeout in the autoscaling policy.
@caching.cached_api_call
def get_auto_scaling_policy(project_id: str, region: str,
                            policy_id: str) -> AutoScalingPolicy:
  logging.debug('fetching autoscalingpolicy: %s', project_id)
  dataproc = apis.get_api('dataproc', 'v1', project_id)
  name = (
      f'projects/{project_id}/regions/{region}/autoscalingPolicies/{policy_id}')
  try:
    request = dataproc.projects().regions().autoscalingPolicies().get(name=name)
    response = request.execute(num_retries=config.API_RETRIES)
    return AutoScalingPolicy(project_id, response, region)
  except googleapiclient.errors.HttpError as err:
    raise utils.GcpApiError(err) from err
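A hedged sketch tying Cluster.autoscaling_policy_id to get_auto_scaling_policy; all identifiers are hypothetical:

from gcpdiag.queries import dataproc

cluster = dataproc.get_cluster('example-cluster', 'us-central1', 'example-project')
if cluster and cluster.has_autoscaling_policy:
  policy = dataproc.get_auto_scaling_policy('example-project', 'us-central1',
                                            cluster.autoscaling_policy_id)
  if policy.has_graceful_decommission_timeout:
    print(policy.short_path, policy.graceful_decommission_timeout)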
@caching.cached_api_call
def list_auto_scaling_policies(project_id: str,
                               region: str) -> List[AutoScalingPolicy]:
  """Lists all autoscaling policies in the given project and region."""
  dataproc = apis.get_api('dataproc', 'v1', project_id)
  parent = f'projects/{project_id}/regions/{region}'
  try:
    request = (dataproc.projects().regions().autoscalingPolicies().list(
        parent=parent))
    response = request.execute(num_retries=config.API_RETRIES)
    return [
        AutoScalingPolicy(project_id, policy_data, region)
        for policy_data in response.get('policies', [])
    ]
  except googleapiclient.errors.HttpError as err:
    raise utils.GcpApiError(err) from err
Lists all autoscaling policies in the given project and region.
class Job(models.Resource):
  """Job."""

  _resource_data: dict

  def __init__(self, project_id, job_id, region, resource_data):
    super().__init__(project_id=project_id)
    self._resource_data = resource_data
    self.region = region
    self.job_id = job_id

  @property
  def full_path(self) -> str:
    return (
        f'projects/{self.project_id}/regions/{self.region}/jobs/{self.job_id}')

  @property
  def short_path(self) -> str:
    return f'{self.project_id}/{self.region}/{self.job_id}'

  @property
  def cluster_name(self) -> str:
    return self._resource_data['placement']['clusterName']

  @property
  def cluster_uuid(self) -> str:
    return self._resource_data['placement']['clusterUuid']

  @property
  def state(self):
    return self._resource_data['status']['state']

  @property
  def details(self):
    if self._resource_data['status']['state'] == 'ERROR':
      return self._resource_data['status']['details']
    return None

  @property
  def status_history(self):
    status_history_dict = {}
    for previous_status in self._resource_data['statusHistory']:
      if previous_status['state'] not in status_history_dict:
        status_history_dict[
            previous_status['state']] = previous_status['stateStartTime']

    return status_history_dict

  @property
  def yarn_applications(self):
    return self._resource_data['yarnApplications']

  @property
  def driver_output_resource_uri(self):
    return self._resource_data.get('driverOutputResourceUri')

  @property
  def job_uuid(self):
    return self._resource_data.get('jobUuid')

  @property
  def job_provided_bq_connector(self):
    """Check user-supplied BigQuery connector on the job level"""
    jar_file_uris = (self._resource_data.get('sparkJob', {}).get('jarFileUris'))
    if jar_file_uris is not None:
      for file in jar_file_uris:
        if 'spark-bigquery-latest.jar' in file:
          return 'spark-bigquery-latest'
        else:
          match = re.search(
              r'spark-bigquery(?:-with-dependencies_\d+\.\d+)?-(\d+\.\d+\.\d+)\.jar',
              file)
          if match:
            return match.group(1)
    return None
Job.
  @property
  def full_path(self) -> str:
    return (
        f'projects/{self.project_id}/regions/{self.region}/jobs/{self.job_id}')
Returns the full path of this resource.
Example: 'projects/gcpdiag-gke-1-9b90/zones/europe-west4-a/clusters/gke1'
  @property
  def short_path(self) -> str:
    return f'{self.project_id}/{self.region}/{self.job_id}'
Returns the short name for this resource.
Note that it isn't clear from this name what kind of resource it is.
Example: 'gke1'
  @property
  def status_history(self):
    status_history_dict = {}
    for previous_status in self._resource_data['statusHistory']:
      if previous_status['state'] not in status_history_dict:
        status_history_dict[
            previous_status['state']] = previous_status['stateStartTime']

    return status_history_dict
  @property
  def job_provided_bq_connector(self):
    """Check user-supplied BigQuery connector on the job level"""
    jar_file_uris = (self._resource_data.get('sparkJob', {}).get('jarFileUris'))
    if jar_file_uris is not None:
      for file in jar_file_uris:
        if 'spark-bigquery-latest.jar' in file:
          return 'spark-bigquery-latest'
        else:
          match = re.search(
              r'spark-bigquery(?:-with-dependencies_\d+\.\d+)?-(\d+\.\d+\.\d+)\.jar',
              file)
          if match:
            return match.group(1)
    return None
Checks for a user-supplied BigQuery connector at the job level.
@caching.cached_api_call
def get_job_by_jobid(project_id: str, region: str, job_id: str):
  dataproc = apis.get_api('dataproc', 'v1', project_id)
  try:
    request = (dataproc.projects().regions().jobs().get(projectId=project_id,
                                                        region=region,
                                                        jobId=job_id))
    response = request.execute(num_retries=config.API_RETRIES)
    return Job(project_id,
               job_id=job_id,
               region=region,
               resource_data=response)
  except googleapiclient.errors.HttpError as err:
    raise utils.GcpApiError(err) from err
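A minimal sketch of looking up a job and inspecting its state history; the identifiers are hypothetical, and the call raises GcpApiError on HTTP failures:

from gcpdiag.queries import dataproc

job = dataproc.get_job_by_jobid('example-project', 'us-central1', 'example-job-id')
print(job.cluster_name, job.state)
if job.state == 'ERROR':
  print(job.details)
for state, started_at in job.status_history.items():
  print(state, started_at)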
@caching.cached_api_call
def extract_dataproc_supported_version() -> list[str]:
  """Extract the supported Dataproc versions(use Debian as representative).
  """

  page_url = 'https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-version-clusters'

  try:
    table = web.fetch_and_extract_table(page_url,
                                        tag='h3',
                                        tag_id='debian_images')
    if table:
      rows = table.find_all('tr')[1:]  #Skip the header row
      version_list = []

      for row in rows:
        dp_version = row.find_all('td')[0].get_text().strip().split('-')[0]
        version_list.append(dp_version)
      return version_list

    else:
      return []
  except (
      requests.exceptions.RequestException,
      AttributeError,
      TypeError,
      ValueError,
      IndexError,
  ) as e:
    logging.error(
        'Error in extracting dataproc versions: %s',
        e,
    )
    return []
Extract the supported Dataproc versions (using Debian images as representative).
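For illustration, a short sketch that prints the scraped list; the contents depend on the documentation page at the time of the call:

from gcpdiag.queries import dataproc

# Each entry is a major.minor image version such as '2.2' (values depend on the page).
for version in dataproc.extract_dataproc_supported_version():
  print(version)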
@caching.cached_api_call
def extract_dataproc_bigquery_version(image_version) -> list[str]:
  """Extract Dataproc BigQuery connector versions based on image version GCP documentation.
  """

  page_url = ('https://cloud.google.com/dataproc/docs/concepts/versioning/'
              'dataproc-release-' + image_version)

  try:
    table = web.fetch_and_extract_table(page_url, tag='div')
    bq_version = []
    if table:
      rows = table.find_all('tr')[1:]
      for row in rows:
        cells = row.find_all('td')
        if 'BigQuery Connector' in cells[0].get_text(strip=True):
          bq_version = cells[1].get_text(strip=True)
    return bq_version
  except (
      requests.exceptions.RequestException,
      AttributeError,
      TypeError,
      ValueError,
      IndexError,
  ) as e:
    logging.error(
        '%s Error in extracting BigQuery connector versions.'
        ' Please check BigQuery Connector version on %s',
        e,
        page_url,
    )
    return []
Extract the Dataproc BigQuery connector version for a given image version from the GCP documentation.
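A hedged sketch combining this helper with Cluster.cluster_provided_bq_connector to decide which connector version a cluster effectively uses; the names are hypothetical, and the major.minor derivation assumes an imageVersion string such as '2.1.27-debian11':

from gcpdiag.queries import dataproc

cluster = dataproc.get_cluster('example-cluster', 'us-central1', 'example-project')
if cluster:
  version = cluster.cluster_provided_bq_connector
  if version is None:
    # Fall back to the default connector documented for the image's major.minor version.
    major_minor = '.'.join(cluster.image_version.split('.')[:2])
    version = dataproc.extract_dataproc_bigquery_version(major_minor)
  print(version)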