| #!/usr/bin/env python |
| # |
| # Copyright 2007 Google Inc. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """Defines input readers for MapReduce.""" |
| |
| |
| __all__ = [ |
| "AbstractDatastoreInputReader", |
| "ALLOW_CHECKPOINT", |
| "BadReaderParamsError", |
| "BlobstoreLineInputReader", |
| "BlobstoreZipInputReader", |
| "BlobstoreZipLineInputReader", |
| "COUNTER_IO_READ_BYTES", |
| "COUNTER_IO_READ_MSEC", |
| "DatastoreEntityInputReader", |
| "DatastoreInputReader", |
| "DatastoreKeyInputReader", |
| "FileInputReader", |
| "RandomStringInputReader", |
| "RawDatastoreInputReader", |
| "Error", |
| "InputReader", |
| "LogInputReader", |
| "NamespaceInputReader", |
| "RecordsReader", |
| ] |
| |
| |
| |
| |
| import base64 |
| import copy |
| import logging |
| import pickle |
| import random |
| import string |
| import StringIO |
| import time |
| import zipfile |
| |
| from google.net.proto import ProtocolBuffer |
| from google.appengine.ext import ndb |
| |
| from google.appengine.api import datastore |
| from google.appengine.api import files |
| from google.appengine.api import logservice |
| from google.appengine.api.files import file_service_pb |
| from google.appengine.api.logservice import log_service_pb |
| from google.appengine.ext import blobstore |
| from google.appengine.ext import db |
| from google.appengine.ext import key_range |
| from google.appengine.ext.db import metadata |
| from google.appengine.ext.mapreduce import context |
| from google.appengine.ext.mapreduce import datastore_range_iterators as db_iters |
| from google.appengine.ext.mapreduce import errors |
| from google.appengine.ext.mapreduce import file_format_parser |
| from google.appengine.ext.mapreduce import file_format_root |
| from google.appengine.ext.mapreduce import json_util |
| from google.appengine.ext.mapreduce import key_ranges |
| from google.appengine.ext.mapreduce import model |
| from google.appengine.ext.mapreduce import namespace_range |
| from google.appengine.ext.mapreduce import operation |
| from google.appengine.ext.mapreduce import property_range |
| from google.appengine.ext.mapreduce import records |
| from google.appengine.ext.mapreduce import util |
| |
| |
| |
| try: |
| |
| from google.appengine.ext import cloudstorage |
| if hasattr(cloudstorage, "_STUB"): |
| cloudstorage = None |
| except ImportError: |
| pass |
| |
| |
| |
| Error = errors.Error |
| BadReaderParamsError = errors.BadReaderParamsError |
| |
| |
| |
| # Counter name for the number of bytes read by an input reader. |
| COUNTER_IO_READ_BYTES = "io-read-bytes" |
| |
| |
| # Counter name for the milliseconds spent reading input. |
| COUNTER_IO_READ_MSEC = "io-read-msec" |
| |
| |
| |
| |
| # Sentinel an input reader may yield instead of a real value, giving the |
| # framework a chance to checkpoint the reader's state without calling the |
| # map handler. |
| ALLOW_CHECKPOINT = object() |
| |
| |
| class InputReader(json_util.JsonMixin): |
| """Abstract base class for input readers. |
| |
| InputReaders have the following properties: |
| * They are created by using the split_input method to generate a set of |
| InputReaders from a MapperSpec. |
| * They generate inputs to the mapper via the iterator interface. |
| * After creation, they can be serialized and resumed using the JsonMixin |
| interface. |
| * They are cast to string for a user-readable description; it may be |
| valuable to implement __str__. |
| """ |
| |
| |
| |
| |
| expand_parameters = False |
| |
| |
| _APP_PARAM = "_app" |
| NAMESPACE_PARAM = "namespace" |
| NAMESPACES_PARAM = "namespaces" |
| |
| def __iter__(self): |
| return self |
| |
| def next(self): |
| """Returns the next input from this input reader as a key, value pair. |
| |
| Returns: |
| The next input from this input reader. |
| """ |
| raise NotImplementedError("next() not implemented in %s" % self.__class__) |
| |
| @classmethod |
| def from_json(cls, input_shard_state): |
| """Creates an instance of the InputReader for the given input shard state. |
| |
| Args: |
| input_shard_state: The InputReader state as a dict-like object. |
| |
| Returns: |
| An instance of the InputReader configured using the values of json. |
| """ |
| raise NotImplementedError("from_json() not implemented in %s" % cls) |
| |
| def to_json(self): |
| """Returns an input shard state for the remaining inputs. |
| |
| Returns: |
| A json-izable version of the remaining InputReader. |
| """ |
| raise NotImplementedError("to_json() not implemented in %s" % |
| self.__class__) |
| |
| @classmethod |
| def split_input(cls, mapper_spec): |
| """Returns a list of input readers. |
| |
| This method creates a list of input readers, each for one shard. |
| It attempts to split inputs among readers evenly. |
| |
| Args: |
| mapper_spec: model.MapperSpec specifies the inputs and additional |
| parameters to define the behavior of input readers. |
| |
| Returns: |
| A list of InputReaders. None or [] when no input data can be found. |
| """ |
| raise NotImplementedError("split_input() not implemented in %s" % cls) |
| |
| @classmethod |
| def validate(cls, mapper_spec): |
| """Validates mapper spec and all mapper parameters. |
| |
| Input reader parameters are expected to be passed as an "input_reader" |
| subdictionary in mapper_spec.params. |
| |
| Pre-1.6.4 APIs mix input reader parameters with all other parameters. For |
| compatibility, input readers check mapper_spec.params as well and issue a |
| warning if the "input_reader" subdictionary is not present. |
| |
| Args: |
| mapper_spec: The MapperSpec for this InputReader. |
| |
| Raises: |
| BadReaderParamsError: required parameters are missing or invalid. |
| """ |
| if mapper_spec.input_reader_class() != cls: |
| raise BadReaderParamsError("Input reader class mismatch") |
| |
| |
| def _get_params(mapper_spec, allowed_keys=None, allow_old=True): |
| """Obtain input reader parameters. |
| |
| Utility function for input reader implementations. Fetches parameters |
| from the mapreduce specification, giving appropriate usage warnings. |
| |
| Args: |
| mapper_spec: The MapperSpec for the job |
| allowed_keys: set of all allowed keys in parameters as strings. If it is not |
| None, then parameters are expected to be in a separate "input_reader" |
| subdictionary of mapper_spec parameters. |
| allow_old: Allow parameters to exist outside of the input_reader |
| subdictionary for compatibility. |
| |
| Returns: |
| mapper parameters as dict |
| |
| Raises: |
| BadReaderParamsError: if parameters are invalid/missing or not allowed. |
| """ |
| if "input_reader" not in mapper_spec.params: |
| message = ("Input reader's parameters should be specified in " |
| "input_reader subdictionary.") |
| if not allow_old or allowed_keys: |
| raise errors.BadReaderParamsError(message) |
| params = mapper_spec.params |
| params = dict((str(n), v) for n, v in params.iteritems()) |
| else: |
| if not isinstance(mapper_spec.params.get("input_reader"), dict): |
| raise errors.BadReaderParamsError( |
| "Input reader parameters should be a dictionary") |
| params = mapper_spec.params.get("input_reader") |
| params = dict((str(n), v) for n, v in params.iteritems()) |
| if allowed_keys: |
| params_diff = set(params.keys()) - allowed_keys |
| if params_diff: |
| raise errors.BadReaderParamsError( |
| "Invalid input_reader parameters: %s" % ",".join(params_diff)) |
| return params |
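| # Illustrative sketch (not executed) of how _get_params resolves parameters |
| # for a hypothetical mapper_spec; the key names below are made up: |
| # |
| #   mapper_spec.params = { |
| #       "processing_rate": 100,                  # not a reader parameter |
| #       "input_reader": {"entity_kind": "Foo",   # reader-only parameters |
| #                        "batch_size": 25}, |
| #   } |
| #   _get_params(mapper_spec) |
| #   # -> {"entity_kind": "Foo", "batch_size": 25} |
| # |
| # With allowed_keys=set(["entity_kind"]), the extra "batch_size" key would |
| # raise BadReaderParamsError("Invalid input_reader parameters: batch_size"). |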
| |
| |
| class FileInputReader(InputReader): |
| """Reader to read Files API files of user specified format. |
| |
| This class currently only supports Google Storage files. It will be extended |
| to support blobstore files in the future. |
| |
| Reader Parameters: |
| files: a list of filenames or filename patterns. |
| filename must be of format '/gs/bucket/filename'. |
| filename pattern has format '/gs/bucket/prefix*'. |
| filename pattern will be expanded to filenames with the given prefix. |
| Please see parseGlob in the file api.files.gs.py which is included in the |
| App Engine SDK for supported patterns. |
| |
| Example: |
| ["/gs/bucket1/file1", "/gs/bucket2/*", "/gs/bucket3/p*"] |
| includes "file1", all files under bucket2, and files under bucket3 with |
| a prefix "p" in its name. |
| |
| format: format string determines what your map function gets as its input. |
| format string can be "lines", "bytes", "zip", or a cascade of them plus |
| optional parameters. See file_formats.FORMATS for all supported formats. |
| See file_format_parser._FileFormatParser for format string syntax. |
| |
| Example: |
| "lines": your map function gets files' contents line by line. |
| "bytes": your map function gets files' contents entirely. |
| "zip": InputReader unzips files and feeds your map function each of |
| the archive's member files as a whole. |
| "zip[bytes]: same as above. |
| "zip[lines]": InputReader unzips files and feeds your map function |
| files' contents line by line. |
| "zip[lines(encoding=utf32)]": InputReader unzips files, reads each |
| file with utf32 encoding and feeds your map function line by line. |
| "base64[zip[lines(encoding=utf32)]]: InputReader decodes files with |
| base64 encoding, unzips each file, reads each of them with utf32 |
| encoding and feeds your map function line by line. |
| |
| Note that "encoding" only teaches InputReader how to interpret files. |
| The input your map function gets is always a Python str. |
| """ |
| |
| |
| FILES_PARAM = "files" |
| FORMAT_PARAM = "format" |
| |
| def __init__(self, format_root): |
| """Initialize input reader. |
| |
| Args: |
| format_root: a FileFormatRoot instance. |
| """ |
| self._file_format_root = format_root |
| |
| def __iter__(self): |
| """Inherit docs.""" |
| return self |
| |
| def next(self): |
| """Inherit docs.""" |
| ctx = context.get() |
| start_time = time.time() |
| |
| content = self._file_format_root.next().read() |
| |
| if ctx: |
| operation.counters.Increment( |
| COUNTER_IO_READ_MSEC, int((time.time() - start_time) * 1000))(ctx) |
| operation.counters.Increment(COUNTER_IO_READ_BYTES, len(content))(ctx) |
| |
| return content |
| |
| @classmethod |
| def split_input(cls, mapper_spec): |
| """Inherit docs.""" |
| params = _get_params(mapper_spec) |
| |
| |
| filenames = [] |
| for f in params[cls.FILES_PARAM]: |
| parsedName = files.gs.parseGlob(f) |
| if isinstance(parsedName, tuple): |
| filenames.extend(files.gs.listdir(parsedName[0], |
| {"prefix": parsedName[1]})) |
| else: |
| filenames.append(parsedName) |
| |
| file_format_roots = file_format_root.split(filenames, |
| params[cls.FORMAT_PARAM], |
| mapper_spec.shard_count) |
| |
| if file_format_roots is None: |
| return [] |
| return [cls(root) for root in file_format_roots] |
| |
| @classmethod |
| def validate(cls, mapper_spec): |
| """Inherit docs.""" |
| if mapper_spec.input_reader_class() != cls: |
| raise BadReaderParamsError("Mapper input reader class mismatch") |
| |
| |
| params = _get_params(mapper_spec) |
| if cls.FILES_PARAM not in params: |
| raise BadReaderParamsError("Must specify %s" % cls.FILES_PARAM) |
| if cls.FORMAT_PARAM not in params: |
| raise BadReaderParamsError("Must specify %s" % cls.FORMAT_PARAM) |
| |
| format_string = params[cls.FORMAT_PARAM] |
| if not isinstance(format_string, basestring): |
| raise BadReaderParamsError("format should be string but is %s" % |
| cls.FORMAT_PARAM) |
| try: |
| file_format_parser.parse(format_string) |
| except ValueError, e: |
| raise BadReaderParamsError(e) |
| |
| paths = params[cls.FILES_PARAM] |
| if not (paths and isinstance(paths, list)): |
| raise BadReaderParamsError("files should be a list of filenames.") |
| |
| |
| try: |
| for path in paths: |
| files.gs.parseGlob(path) |
| except files.InvalidFileNameError: |
| raise BadReaderParamsError("Invalid filename %s." % path) |
| |
| @classmethod |
| def from_json(cls, json): |
| """Inherit docs.""" |
| return cls( |
| file_format_root.FileFormatRoot.from_json(json["file_format_root"])) |
| |
| def to_json(self): |
| """Inherit docs.""" |
| return {"file_format_root": self._file_format_root.to_json()} |
| |
| |
| class AbstractDatastoreInputReader(InputReader): |
| """Abstract class for datastore input readers.""" |
| |
| |
| # Default number of entities to fetch per datastore call. |
| _BATCH_SIZE = 50 |
| |
| |
| # Maximum number of shards this reader will create. |
| _MAX_SHARD_COUNT = 256 |
| |
| |
| |
| |
| # If the number of populated namespaces is at most this, each namespace is |
| # split into key ranges; otherwise sharding falls back to namespace ranges. |
| MAX_NAMESPACES_FOR_KEY_SHARD = 10 |
| |
| |
| ENTITY_KIND_PARAM = "entity_kind" |
| KEYS_ONLY_PARAM = "keys_only" |
| BATCH_SIZE_PARAM = "batch_size" |
| KEY_RANGE_PARAM = "key_range" |
| FILTERS_PARAM = "filters" |
| |
| _KEY_RANGE_ITER_CLS = db_iters.AbstractKeyRangeIterator |
| |
| def __init__(self, iterator): |
| """Create new DatastoreInputReader object. |
| |
| This is an internal constructor. Use split_input to create readers instead. |
| |
| Args: |
| iterator: an iterator that generates objects for this input reader. |
| """ |
| self._iter = iterator |
| |
| def __iter__(self): |
| """Yields whatever internal iterator yields.""" |
| for o in self._iter: |
| yield o |
| |
| def __str__(self): |
| """Returns the string representation of this InputReader.""" |
| return repr(self._iter) |
| |
| def to_json(self): |
| """Serializes input reader to json compatible format. |
| |
| Returns: |
| all the data in json-compatible map. |
| """ |
| return self._iter.to_json() |
| |
| @classmethod |
| def from_json(cls, json): |
| """Create new DatastoreInputReader from json, encoded by to_json. |
| |
| Args: |
| json: json representation of DatastoreInputReader. |
| |
| Returns: |
| an instance of DatastoreInputReader with all data deserialized from json. |
| """ |
| return cls(db_iters.RangeIteratorFactory.from_json(json)) |
| |
| @classmethod |
| def _get_query_spec(cls, mapper_spec): |
| """Construct a model.QuerySpec from model.MapperSpec.""" |
| params = _get_params(mapper_spec) |
| entity_kind = params[cls.ENTITY_KIND_PARAM] |
| filters = params.get(cls.FILTERS_PARAM) |
| app = params.get(cls._APP_PARAM) |
| ns = params.get(cls.NAMESPACE_PARAM) |
| |
| return model.QuerySpec( |
| entity_kind=cls._get_raw_entity_kind(entity_kind), |
| keys_only=bool(params.get(cls.KEYS_ONLY_PARAM, False)), |
| filters=filters, |
| batch_size=int(params.get(cls.BATCH_SIZE_PARAM, cls._BATCH_SIZE)), |
| model_class_path=entity_kind, |
| app=app, |
| ns=ns) |
| |
| @classmethod |
| def split_input(cls, mapper_spec): |
| """Inherit doc.""" |
| shard_count = mapper_spec.shard_count |
| query_spec = cls._get_query_spec(mapper_spec) |
| |
| namespaces = None |
| if query_spec.ns is not None: |
| k_ranges = cls._to_key_ranges_by_shard( |
| query_spec.app, [query_spec.ns], shard_count, query_spec) |
| else: |
| ns_keys = namespace_range.get_namespace_keys( |
| query_spec.app, cls.MAX_NAMESPACES_FOR_KEY_SHARD+1) |
| |
| |
| if not ns_keys: |
| return |
| |
| |
| elif len(ns_keys) <= cls.MAX_NAMESPACES_FOR_KEY_SHARD: |
| namespaces = [ns_key.name() or "" for ns_key in ns_keys] |
| k_ranges = cls._to_key_ranges_by_shard( |
| query_spec.app, namespaces, shard_count, query_spec) |
| |
| else: |
| ns_ranges = namespace_range.NamespaceRange.split(n=shard_count, |
| contiguous=False, |
| can_query=lambda: True, |
| _app=query_spec.app) |
| k_ranges = [key_ranges.KeyRangesFactory.create_from_ns_range(ns_range) |
| for ns_range in ns_ranges] |
| |
| iters = [db_iters.RangeIteratorFactory.create_key_ranges_iterator( |
| r, query_spec, cls._KEY_RANGE_ITER_CLS) for r in k_ranges] |
| |
| return [cls(i) for i in iters] |
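| # Summary of the sharding decision above: an explicit "namespace" parameter |
| # restricts key-range sharding to that namespace; with at most |
| # MAX_NAMESPACES_FOR_KEY_SHARD namespaces, every namespace is split into key |
| # ranges that are spread across the shards; with more namespaces than that, |
| # the job is sharded by NamespaceRange instead, one range of namespaces per |
| # shard. |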
| |
| @classmethod |
| def _to_key_ranges_by_shard(cls, app, namespaces, shard_count, query_spec): |
| """Get a list of key_ranges.KeyRanges objects, one for each shard. |
| |
| This method uses the scatter index to split each namespace into pieces |
| and assign those pieces to shards. |
| |
| Args: |
| app: app_id in str. |
| namespaces: a list of namespaces in str. |
| shard_count: number of shards to split. |
| query_spec: model.QuerySpec. |
| |
| Returns: |
| a list of key_ranges.KeyRanges objects. |
| """ |
| key_ranges_by_ns = [] |
| |
| |
| for namespace in namespaces: |
| ranges = cls._split_ns_by_scatter( |
| shard_count, |
| namespace, |
| query_spec.entity_kind, |
| app) |
| |
| |
| random.shuffle(ranges) |
| key_ranges_by_ns.append(ranges) |
| |
| |
| |
| |
| ranges_by_shard = [[] for _ in range(shard_count)] |
| for ranges in key_ranges_by_ns: |
| for i, k_range in enumerate(ranges): |
| if k_range: |
| ranges_by_shard[i].append(k_range) |
| |
| key_ranges_by_shard = [] |
| for ranges in ranges_by_shard: |
| if ranges: |
| key_ranges_by_shard.append(key_ranges.KeyRangesFactory.create_from_list( |
| ranges)) |
| return key_ranges_by_shard |
| |
| @classmethod |
| def _split_ns_by_scatter(cls, |
| shard_count, |
| namespace, |
| raw_entity_kind, |
| app): |
| """Split a namespace by scatter index into key_range.KeyRange. |
| |
| TODO: Power this with key_range.KeyRange.compute_split_points. |
| |
| Args: |
| shard_count: number of shards. |
| namespace: namespace name to split. str. |
| raw_entity_kind: low level datastore API entity kind. |
| app: app id in str. |
| |
| Returns: |
| A list of key_range.KeyRange objects. If there are not enough entities to |
| split into the requested number of shards, the returned list will contain |
| KeyRanges ordered lexicographically with any Nones appearing at the end. |
| """ |
| if shard_count == 1: |
| |
| return [key_range.KeyRange(namespace=namespace, _app=app)] |
| |
| ds_query = datastore.Query(kind=raw_entity_kind, |
| namespace=namespace, |
| _app=app, |
| keys_only=True) |
| ds_query.Order("__scatter__") |
| oversampling_factor = 32 |
| random_keys = ds_query.Get(shard_count * oversampling_factor) |
| |
| if not random_keys: |
| |
| |
| return ([key_range.KeyRange(namespace=namespace, _app=app)] + |
| [None] * (shard_count - 1)) |
| |
| random_keys.sort() |
| |
| if len(random_keys) >= shard_count: |
| |
| random_keys = cls._choose_split_points(random_keys, shard_count) |
| |
| k_ranges = [] |
| |
| k_ranges.append(key_range.KeyRange( |
| key_start=None, |
| key_end=random_keys[0], |
| direction=key_range.KeyRange.ASC, |
| include_start=False, |
| include_end=False, |
| namespace=namespace, |
| _app=app)) |
| |
| for i in range(0, len(random_keys) - 1): |
| k_ranges.append(key_range.KeyRange( |
| key_start=random_keys[i], |
| key_end=random_keys[i+1], |
| direction=key_range.KeyRange.ASC, |
| include_start=True, |
| include_end=False, |
| namespace=namespace, |
| _app=app)) |
| |
| k_ranges.append(key_range.KeyRange( |
| key_start=random_keys[-1], |
| key_end=None, |
| direction=key_range.KeyRange.ASC, |
| include_start=True, |
| include_end=False, |
| namespace=namespace, |
| _app=app)) |
| |
| if len(k_ranges) < shard_count: |
| |
| k_ranges += [None] * (shard_count - len(k_ranges)) |
| return k_ranges |
| |
| @classmethod |
| def _choose_split_points(cls, sorted_keys, shard_count): |
| """Returns the best split points given a random set of datastore.Keys.""" |
| assert len(sorted_keys) >= shard_count |
| index_stride = len(sorted_keys) / float(shard_count) |
| return [sorted_keys[int(round(index_stride * i))] |
| for i in range(1, shard_count)] |
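| # Worked example of the stride arithmetic above (hypothetical numbers): with |
| # 128 sorted scatter keys and shard_count=4, index_stride is 32.0, so the |
| # chosen split points are sorted_keys[32], sorted_keys[64] and |
| # sorted_keys[96], i.e. shard_count - 1 boundaries defining shard_count |
| # ranges. |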
| |
| @classmethod |
| def validate(cls, mapper_spec): |
| """Inherit docs.""" |
| params = _get_params(mapper_spec) |
| if cls.ENTITY_KIND_PARAM not in params: |
| raise BadReaderParamsError("Missing input reader parameter 'entity_kind'") |
| if cls.BATCH_SIZE_PARAM in params: |
| try: |
| batch_size = int(params[cls.BATCH_SIZE_PARAM]) |
| if batch_size < 1: |
| raise BadReaderParamsError("Bad batch size: %s" % batch_size) |
| except ValueError, e: |
| raise BadReaderParamsError("Bad batch size: %s" % e) |
| try: |
| bool(params.get(cls.KEYS_ONLY_PARAM, False)) |
| except: |
| raise BadReaderParamsError("keys_only expects a boolean value but got %s", |
| params[cls.KEYS_ONLY_PARAM]) |
| if cls.NAMESPACE_PARAM in params: |
| if not isinstance(params[cls.NAMESPACE_PARAM], |
| (str, unicode, type(None))): |
| raise BadReaderParamsError( |
| "Expected a single namespace string") |
| if cls.NAMESPACES_PARAM in params: |
| raise BadReaderParamsError("Multiple namespaces are no longer supported") |
| if cls.FILTERS_PARAM in params: |
| filters = params[cls.FILTERS_PARAM] |
| if not isinstance(filters, list): |
| raise BadReaderParamsError("Expected list for filters parameter") |
| for f in filters: |
| if not isinstance(f, (tuple, list)): |
| raise BadReaderParamsError("Filter should be a tuple or list: %s", f) |
| if len(f) != 3: |
| raise BadReaderParamsError("Filter should be a 3-tuple: %s", f) |
| prop, op, _ = f |
| if not isinstance(prop, basestring): |
| raise BadReaderParamsError("Property should be string: %s", prop) |
| if not isinstance(op, basestring): |
| raise BadReaderParamsError("Operator should be string: %s", op) |
| |
| @classmethod |
| def _get_raw_entity_kind(cls, entity_kind_or_model_classpath): |
| """Returns the entity kind to use with low level datastore calls. |
| |
| Args: |
| entity_kind_or_model_classpath: user specified entity kind or model |
| classpath. |
| |
| Returns: |
| the entity kind in str to use with low level datastore calls. |
| """ |
| return entity_kind_or_model_classpath |
| |
| |
| class RawDatastoreInputReader(AbstractDatastoreInputReader): |
| """Iterates over an entity kind and yields datastore.Entity.""" |
| |
| _KEY_RANGE_ITER_CLS = db_iters.KeyRangeEntityIterator |
| |
| @classmethod |
| def validate(cls, mapper_spec): |
| """Inherit docs.""" |
| super(RawDatastoreInputReader, cls).validate(mapper_spec) |
| params = _get_params(mapper_spec) |
| entity_kind = params[cls.ENTITY_KIND_PARAM] |
| if "." in entity_kind: |
| logging.warning( |
| ". detected in entity kind %s specified for reader %s." |
| "Assuming entity kind contains the dot.", |
| entity_kind, cls.__name__) |
| if cls.FILTERS_PARAM in params: |
| filters = params[cls.FILTERS_PARAM] |
| for f in filters: |
| if f[1] != "=": |
| raise BadReaderParamsError( |
| "Only equality filters are supported: %s", f) |
| |
| |
| class DatastoreInputReader(AbstractDatastoreInputReader): |
| """Iterates over a Model and yields model instances. |
| |
| Supports both db.model and ndb.model. |
| """ |
| |
| _KEY_RANGE_ITER_CLS = db_iters.KeyRangeModelIterator |
| |
| @classmethod |
| def _get_raw_entity_kind(cls, model_classpath): |
| entity_type = util.for_name(model_classpath) |
| if isinstance(entity_type, db.Model): |
| return entity_type.kind() |
| elif isinstance(entity_type, (ndb.Model, ndb.MetaModel)): |
| |
| return entity_type._get_kind() |
| else: |
| return util.get_short_name(model_classpath) |
| |
| @classmethod |
| def validate(cls, mapper_spec): |
| """Inherit docs.""" |
| super(DatastoreInputReader, cls).validate(mapper_spec) |
| params = _get_params(mapper_spec) |
| entity_kind = params[cls.ENTITY_KIND_PARAM] |
| |
| try: |
| model_class = util.for_name(entity_kind) |
| except ImportError, e: |
| raise BadReaderParamsError("Bad entity kind: %s" % e) |
| if cls.FILTERS_PARAM in params: |
| filters = params[cls.FILTERS_PARAM] |
| if issubclass(model_class, db.Model): |
| cls._validate_filters(filters, model_class) |
| else: |
| cls._validate_filters_ndb(filters, model_class) |
| property_range.PropertyRange(filters, entity_kind) |
| |
| @classmethod |
| def _validate_filters(cls, filters, model_class): |
| """Validate user supplied filters. |
| |
| Validate filters are on existing properties and filter values |
| have valid semantics. |
| |
| Args: |
| filters: user supplied filters. Each filter should be a list or tuple of |
| format (<property_name_as_str>, <query_operator_as_str>, |
| <value_of_certain_type>). Value type is up to the property's type. |
| model_class: the db.Model class for the entity type to apply filters on. |
| |
| Raises: |
| BadReaderParamsError: if any filter is invalid in any way. |
| """ |
| if not filters: |
| return |
| |
| properties = model_class.properties() |
| |
| for f in filters: |
| prop, _, val = f |
| if prop not in properties: |
| raise errors.BadReaderParamsError( |
| "Property %s is not defined for entity type %s", |
| prop, model_class.kind()) |
| |
| |
| |
| try: |
| properties[prop].validate(val) |
| except db.BadValueError, e: |
| raise errors.BadReaderParamsError(e) |
| |
| @classmethod |
| def _validate_filters_ndb(cls, filters, model_class): |
| """Validate ndb.Model filters.""" |
| if not filters: |
| return |
| |
| properties = model_class._properties |
| |
| for f in filters: |
| prop, _, val = f |
| if prop not in properties: |
| raise errors.BadReaderParamsError( |
| "Property %s is not defined for entity type %s", |
| prop, model_class._get_kind()) |
| |
| |
| |
| try: |
| properties[prop]._do_validate(val) |
| except db.BadValueError, e: |
| raise errors.BadReaderParamsError(e) |
| |
| @classmethod |
| def split_input(cls, mapper_spec): |
| """Inherit docs.""" |
| shard_count = mapper_spec.shard_count |
| query_spec = cls._get_query_spec(mapper_spec) |
| |
| if not property_range.should_shard_by_property_range(query_spec.filters): |
| return super(DatastoreInputReader, cls).split_input(mapper_spec) |
| |
| p_range = property_range.PropertyRange(query_spec.filters, |
| query_spec.model_class_path) |
| p_ranges = p_range.split(shard_count) |
| |
| |
| if query_spec.ns: |
| ns_range = namespace_range.NamespaceRange( |
| namespace_start=query_spec.ns, |
| namespace_end=query_spec.ns, |
| _app=query_spec.app) |
| ns_ranges = [copy.copy(ns_range) for _ in p_ranges] |
| else: |
| ns_keys = namespace_range.get_namespace_keys( |
| query_spec.app, cls.MAX_NAMESPACES_FOR_KEY_SHARD+1) |
| if not ns_keys: |
| return |
| |
| |
| if len(ns_keys) <= cls.MAX_NAMESPACES_FOR_KEY_SHARD: |
| ns_ranges = [namespace_range.NamespaceRange(_app=query_spec.app) |
| for _ in p_ranges] |
| |
| else: |
| ns_ranges = namespace_range.NamespaceRange.split(n=shard_count, |
| contiguous=False, |
| can_query=lambda: True, |
| _app=query_spec.app) |
| p_ranges = [copy.copy(p_range) for _ in ns_ranges] |
| |
| assert len(p_ranges) == len(ns_ranges) |
| |
| iters = [ |
| db_iters.RangeIteratorFactory.create_property_range_iterator( |
| p, ns, query_spec) for p, ns in zip(p_ranges, ns_ranges)] |
| return [cls(i) for i in iters] |
| |
| |
| class DatastoreKeyInputReader(RawDatastoreInputReader): |
| """Iterate over an entity kind and yields datastore.Key.""" |
| |
| _KEY_RANGE_ITER_CLS = db_iters.KeyRangeKeyIterator |
| |
| |
| |
| DatastoreEntityInputReader = RawDatastoreInputReader |
| |
| |
| |
| |
| class _OldAbstractDatastoreInputReader(InputReader): |
| """Abstract base class for classes that iterate over datastore entities. |
| |
| Concrete subclasses must implement _iter_key_range(self, k_range). See the |
| docstring for that method for details. |
| """ |
| |
| |
| _BATCH_SIZE = 50 |
| |
| |
| _MAX_SHARD_COUNT = 256 |
| |
| |
| # Factor by which scatter keys are oversampled to pick better split points. |
| _OVERSAMPLING_FACTOR = 32 |
| |
| |
| |
| |
| MAX_NAMESPACES_FOR_KEY_SHARD = 10 |
| |
| |
| ENTITY_KIND_PARAM = "entity_kind" |
| KEYS_ONLY_PARAM = "keys_only" |
| BATCH_SIZE_PARAM = "batch_size" |
| KEY_RANGE_PARAM = "key_range" |
| NAMESPACE_RANGE_PARAM = "namespace_range" |
| CURRENT_KEY_RANGE_PARAM = "current_key_range" |
| FILTERS_PARAM = "filters" |
| |
| |
| |
| |
| |
| def __init__(self, |
| entity_kind, |
| key_ranges=None, |
| ns_range=None, |
| batch_size=_BATCH_SIZE, |
| current_key_range=None, |
| filters=None): |
| """Create new AbstractDatastoreInputReader object. |
| |
| This is an internal constructor. Use split_query in a concrete class instead. |
| |
| Args: |
| entity_kind: entity kind as string. |
| key_ranges: a sequence of key_range.KeyRange instances to process. Only |
| one of key_ranges or ns_range can be non-None. |
| ns_range: a namespace_range.NamespaceRange to process. Only one of |
| key_ranges or ns_range can be non-None. |
| batch_size: size of read batch as int. |
| current_key_range: the current key_range.KeyRange being processed. |
| filters: optional list of filters to apply to the query. Each filter is |
| a tuple: (<property_name_as_str>, <query_operation_as_str>, <value>). |
| User filters are applied first. |
| """ |
| assert key_ranges is not None or ns_range is not None, ( |
| "must specify one of 'key_ranges' or 'ns_range'") |
| assert key_ranges is None or ns_range is None, ( |
| "can't specify both 'key_ranges ' and 'ns_range'") |
| |
| self._entity_kind = entity_kind |
| |
| |
| self._key_ranges = key_ranges and list(reversed(key_ranges)) |
| |
| self._ns_range = ns_range |
| self._batch_size = int(batch_size) |
| self._current_key_range = current_key_range |
| self._filters = filters |
| |
| @classmethod |
| def _get_raw_entity_kind(cls, entity_kind): |
| if "." in entity_kind: |
| logging.warning( |
| ". detected in entity kind %s specified for reader %s." |
| "Assuming entity kind contains the dot.", |
| entity_kind, cls.__name__) |
| return entity_kind |
| |
| def __iter__(self): |
| """Iterates over the given KeyRanges or NamespaceRange. |
| |
| This method iterates over the given KeyRanges or NamespaceRange and sets |
| the self._current_key_range to the KeyRange currently being processed. It |
| then delegates to the _iter_key_range method to yield the actual |
| results. |
| |
| Yields: |
| Forwards the objects yielded by the subclasses concrete _iter_key_range() |
| method. The caller must consume the result yielded because self.to_json() |
| will not include it. |
| """ |
| if self._key_ranges is not None: |
| for o in self._iter_key_ranges(): |
| yield o |
| elif self._ns_range is not None: |
| for o in self._iter_ns_range(): |
| yield o |
| else: |
| assert False, "self._key_ranges and self._ns_range are both None" |
| |
| def _iter_key_ranges(self): |
| """Iterates over self._key_ranges, delegating to self._iter_key_range().""" |
| while True: |
| if self._current_key_range is None: |
| if self._key_ranges: |
| self._current_key_range = self._key_ranges.pop() |
| |
| |
| continue |
| else: |
| break |
| |
| for key, o in self._iter_key_range( |
| copy.deepcopy(self._current_key_range)): |
| |
| |
| self._current_key_range.advance(key) |
| yield o |
| self._current_key_range = None |
| |
| def _iter_ns_range(self): |
| """Iterates over self._ns_range, delegating to self._iter_key_range().""" |
| while True: |
| if self._current_key_range is None: |
| query = self._ns_range.make_datastore_query() |
| namespace_result = query.Get(1) |
| if not namespace_result: |
| break |
| |
| namespace = namespace_result[0].name() or "" |
| self._current_key_range = key_range.KeyRange( |
| namespace=namespace, _app=self._ns_range.app) |
| yield ALLOW_CHECKPOINT |
| |
| for key, o in self._iter_key_range( |
| copy.deepcopy(self._current_key_range)): |
| |
| |
| self._current_key_range.advance(key) |
| yield o |
| |
| if (self._ns_range.is_single_namespace or |
| self._current_key_range.namespace == self._ns_range.namespace_end): |
| break |
| self._ns_range = self._ns_range.with_start_after( |
| self._current_key_range.namespace) |
| self._current_key_range = None |
| |
| def _iter_key_range(self, k_range): |
| """Yields a db.Key and the value that should be yielded by self.__iter__(). |
| |
| Args: |
| k_range: The key_range.KeyRange to iterate over. |
| |
| Yields: |
| A 2-tuple containing the last db.Key processed and the value that should |
| be yielded by __iter__. The returned db.Key will be used to determine the |
| InputReader's current position in self._current_key_range. |
| """ |
| raise NotImplementedError("_iter_key_range() not implemented in %s" % |
| self.__class__) |
| |
| def __str__(self): |
| """Returns the string representation of this InputReader.""" |
| if self._ns_range is None: |
| return repr(self._key_ranges) |
| else: |
| return repr(self._ns_range) |
| |
| @classmethod |
| def _choose_split_points(cls, sorted_keys, shard_count): |
| """Returns the best split points given a random set of db.Keys.""" |
| assert len(sorted_keys) >= shard_count |
| index_stride = len(sorted_keys) / float(shard_count) |
| return [sorted_keys[int(round(index_stride * i))] |
| for i in range(1, shard_count)] |
| |
| |
| |
| @classmethod |
| def _split_input_from_namespace(cls, app, namespace, entity_kind, |
| shard_count): |
| """Helper for _split_input_from_params. |
| |
| If there are not enough Entities to make all of the given shards, the |
| returned list of KeyRanges will include Nones. The returned list will |
| contain KeyRanges ordered lexicographically with any Nones appearing at the |
| end. |
| |
| Args: |
| app: the app. |
| namespace: the namespace. |
| entity_kind: entity kind as string. |
| shard_count: the number of shards. |
| |
| Returns: |
| A list of key_range.KeyRange objects. |
| """ |
| |
| raw_entity_kind = cls._get_raw_entity_kind(entity_kind) |
| if shard_count == 1: |
| |
| return [key_range.KeyRange(namespace=namespace, _app=app)] |
| |
| ds_query = datastore.Query(kind=raw_entity_kind, |
| namespace=namespace, |
| _app=app, |
| keys_only=True) |
| ds_query.Order("__scatter__") |
| random_keys = ds_query.Get(shard_count * cls._OVERSAMPLING_FACTOR) |
| |
| if not random_keys: |
| |
| |
| return ([key_range.KeyRange(namespace=namespace, _app=app)] + |
| [None] * (shard_count - 1)) |
| |
| random_keys.sort() |
| |
| if len(random_keys) >= shard_count: |
| |
| random_keys = cls._choose_split_points(random_keys, shard_count) |
| |
| |
| key_ranges = [] |
| |
| key_ranges.append(key_range.KeyRange( |
| key_start=None, |
| key_end=random_keys[0], |
| direction=key_range.KeyRange.ASC, |
| include_start=False, |
| include_end=False, |
| namespace=namespace, |
| _app=app)) |
| |
| for i in range(0, len(random_keys) - 1): |
| key_ranges.append(key_range.KeyRange( |
| key_start=random_keys[i], |
| key_end=random_keys[i+1], |
| direction=key_range.KeyRange.ASC, |
| include_start=True, |
| include_end=False, |
| namespace=namespace, |
| _app=app)) |
| |
| key_ranges.append(key_range.KeyRange( |
| key_start=random_keys[-1], |
| key_end=None, |
| direction=key_range.KeyRange.ASC, |
| include_start=True, |
| include_end=False, |
| namespace=namespace, |
| _app=app)) |
| |
| if len(key_ranges) < shard_count: |
| |
| key_ranges += [None] * (shard_count - len(key_ranges)) |
| |
| return key_ranges |
| |
| @classmethod |
| def _split_input_from_params(cls, app, namespaces, entity_kind_name, |
| params, shard_count): |
| """Return input reader objects. Helper for split_input.""" |
| |
| key_ranges = [] |
| for namespace in namespaces: |
| key_ranges.extend( |
| cls._split_input_from_namespace(app, |
| namespace, |
| entity_kind_name, |
| shard_count)) |
| |
| |
| |
| |
| shared_ranges = [[] for _ in range(shard_count)] |
| for i, k_range in enumerate(key_ranges): |
| shared_ranges[i % shard_count].append(k_range) |
| batch_size = int(params.get(cls.BATCH_SIZE_PARAM, cls._BATCH_SIZE)) |
| |
| return [cls(entity_kind_name, |
| key_ranges=key_ranges, |
| ns_range=None, |
| batch_size=batch_size) |
| for key_ranges in shared_ranges if key_ranges] |
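| # Sketch of the round-robin assignment above (hypothetical ranges): with two |
| # namespaces and shard_count=3, _split_input_from_namespace returns three |
| # KeyRanges per namespace, so key_ranges is [a0, a1, a2, b0, b1, b2]. The |
| # modulo step then yields shard 0 -> [a0, b0], shard 1 -> [a1, b1] and |
| # shard 2 -> [a2, b2], so every shard touches every namespace. |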
| |
| @classmethod |
| def validate(cls, mapper_spec): |
| """Validates mapper spec and all mapper parameters. |
| |
| Args: |
| mapper_spec: The MapperSpec for this InputReader. |
| |
| Raises: |
| BadReaderParamsError: required parameters are missing or invalid. |
| """ |
| if mapper_spec.input_reader_class() != cls: |
| raise BadReaderParamsError("Input reader class mismatch") |
| params = _get_params(mapper_spec) |
| if cls.ENTITY_KIND_PARAM not in params: |
| raise BadReaderParamsError("Missing mapper parameter 'entity_kind'") |
| if cls.BATCH_SIZE_PARAM in params: |
| try: |
| batch_size = int(params[cls.BATCH_SIZE_PARAM]) |
| if batch_size < 1: |
| raise BadReaderParamsError("Bad batch size: %s" % batch_size) |
| except ValueError, e: |
| raise BadReaderParamsError("Bad batch size: %s" % e) |
| if cls.NAMESPACE_PARAM in params: |
| if not isinstance(params[cls.NAMESPACE_PARAM], |
| (str, unicode, type(None))): |
| raise BadReaderParamsError( |
| "Expected a single namespace string") |
| if cls.NAMESPACES_PARAM in params: |
| raise BadReaderParamsError("Multiple namespaces are no longer supported") |
| if cls.FILTERS_PARAM in params: |
| filters = params[cls.FILTERS_PARAM] |
| if not isinstance(filters, list): |
| raise BadReaderParamsError("Expected list for filters parameter") |
| for f in filters: |
| if not isinstance(f, (tuple, list)): |
| raise BadReaderParamsError("Filter should be a tuple or list: %s", f) |
| if len(f) != 3: |
| raise BadReaderParamsError("Filter should be a 3-tuple: %s", f) |
| if not isinstance(f[0], basestring): |
| raise BadReaderParamsError("First element should be string: %s", f) |
| if f[1] != "=": |
| raise BadReaderParamsError( |
| "Only equality filters are supported: %s", f) |
| |
| @classmethod |
| def split_input(cls, mapper_spec): |
| """Splits query into shards without fetching query results. |
| |
| Tries as best as it can to split the whole query result set into equal |
| shards. Due to the difficulty of making a perfect split, the resulting |
| shards' sizes might differ significantly from each other. |
| |
| Args: |
| mapper_spec: MapperSpec with params containing 'entity_kind'. |
| May have 'namespace' in the params as a string containing a single |
| namespace. If specified then the input reader will only yield values |
| in the given namespace. If 'namespace' is not given then values from |
| all namespaces will be yielded. May also have 'batch_size' in the params |
| to specify the number of entities to process in each batch. |
| |
| Returns: |
| A list of InputReader objects. If the query results are empty then the |
| empty list will be returned. Otherwise, the list will always have a length |
| equal to number_of_shards but may be padded with Nones if there are too |
| few results for effective sharding. |
| """ |
| params = _get_params(mapper_spec) |
| entity_kind_name = params[cls.ENTITY_KIND_PARAM] |
| batch_size = int(params.get(cls.BATCH_SIZE_PARAM, cls._BATCH_SIZE)) |
| shard_count = mapper_spec.shard_count |
| namespace = params.get(cls.NAMESPACE_PARAM) |
| app = params.get(cls._APP_PARAM) |
| filters = params.get(cls.FILTERS_PARAM) |
| |
| if namespace is None: |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| namespace_query = datastore.Query("__namespace__", |
| keys_only=True, |
| _app=app) |
| namespace_keys = namespace_query.Get( |
| limit=cls.MAX_NAMESPACES_FOR_KEY_SHARD+1) |
| |
| if len(namespace_keys) > cls.MAX_NAMESPACES_FOR_KEY_SHARD: |
| ns_ranges = namespace_range.NamespaceRange.split(n=shard_count, |
| contiguous=True, |
| _app=app) |
| return [cls(entity_kind_name, |
| key_ranges=None, |
| ns_range=ns_range, |
| batch_size=batch_size, |
| filters=filters) |
| for ns_range in ns_ranges] |
| elif not namespace_keys: |
| return [cls(entity_kind_name, |
| key_ranges=None, |
| ns_range=namespace_range.NamespaceRange(_app=app), |
| batch_size=shard_count, |
| filters=filters)] |
| else: |
| namespaces = [namespace_key.name() or "" |
| for namespace_key in namespace_keys] |
| else: |
| namespaces = [namespace] |
| |
| readers = cls._split_input_from_params( |
| app, namespaces, entity_kind_name, params, shard_count) |
| if filters: |
| for reader in readers: |
| reader._filters = filters |
| return readers |
| |
| def to_json(self): |
| """Serializes all the data in this query range into json form. |
| |
| Returns: |
| all the data in json-compatible map. |
| """ |
| if self._key_ranges is None: |
| key_ranges_json = None |
| else: |
| key_ranges_json = [] |
| for k in self._key_ranges: |
| if k: |
| key_ranges_json.append(k.to_json()) |
| else: |
| key_ranges_json.append(None) |
| |
| if self._ns_range is None: |
| namespace_range_json = None |
| else: |
| namespace_range_json = self._ns_range.to_json_object() |
| |
| if self._current_key_range is None: |
| current_key_range_json = None |
| else: |
| current_key_range_json = self._current_key_range.to_json() |
| |
| json_dict = {self.KEY_RANGE_PARAM: key_ranges_json, |
| self.NAMESPACE_RANGE_PARAM: namespace_range_json, |
| self.CURRENT_KEY_RANGE_PARAM: current_key_range_json, |
| self.ENTITY_KIND_PARAM: self._entity_kind, |
| self.BATCH_SIZE_PARAM: self._batch_size, |
| self.FILTERS_PARAM: self._filters} |
| return json_dict |
| |
| @classmethod |
| def from_json(cls, json): |
| """Create new DatastoreInputReader from the json, encoded by to_json. |
| |
| Args: |
| json: json map representation of DatastoreInputReader. |
| |
| Returns: |
| an instance of DatastoreInputReader with all data deserialized from json. |
| """ |
| if json[cls.KEY_RANGE_PARAM] is None: |
| |
| key_ranges = None |
| else: |
| key_ranges = [] |
| for k in json[cls.KEY_RANGE_PARAM]: |
| if k: |
| key_ranges.append(key_range.KeyRange.from_json(k)) |
| else: |
| key_ranges.append(None) |
| |
| if json[cls.NAMESPACE_RANGE_PARAM] is None: |
| ns_range = None |
| else: |
| ns_range = namespace_range.NamespaceRange.from_json_object( |
| json[cls.NAMESPACE_RANGE_PARAM]) |
| |
| if json[cls.CURRENT_KEY_RANGE_PARAM] is None: |
| current_key_range = None |
| else: |
| current_key_range = key_range.KeyRange.from_json( |
| json[cls.CURRENT_KEY_RANGE_PARAM]) |
| |
| return cls( |
| json[cls.ENTITY_KIND_PARAM], |
| key_ranges, |
| ns_range, |
| json[cls.BATCH_SIZE_PARAM], |
| current_key_range, |
| filters=json.get(cls.FILTERS_PARAM)) |
| |
| |
| class BlobstoreLineInputReader(InputReader): |
| """Input reader for a newline delimited blob in Blobstore.""" |
| |
| |
| # Read buffer size, in bytes, for the underlying BlobReader. |
| _BLOB_BUFFER_SIZE = 64000 |
| |
| |
| _MAX_SHARD_COUNT = 256 |
| |
| |
| # Maximum number of blob keys accepted for a single job. |
| _MAX_BLOB_KEYS_COUNT = 246 |
| |
| |
| BLOB_KEYS_PARAM = "blob_keys" |
| |
| |
| INITIAL_POSITION_PARAM = "initial_position" |
| END_POSITION_PARAM = "end_position" |
| BLOB_KEY_PARAM = "blob_key" |
| |
| def __init__(self, blob_key, start_position, end_position): |
| """Initializes this instance with the given blob key and character range. |
| |
| This BlobstoreLineInputReader will read from the first record starting |
| strictly after start_position until the first record ending at or after |
| end_position. As an exception, if start_position is 0, then |
| this InputReader starts reading at the first record. |
| |
| Args: |
| blob_key: the BlobKey that this input reader is processing. |
| start_position: the position to start reading at. |
| end_position: a position in the last record to read. |
| """ |
| self._blob_key = blob_key |
| self._blob_reader = blobstore.BlobReader(blob_key, |
| self._BLOB_BUFFER_SIZE, |
| start_position) |
| self._end_position = end_position |
| self._has_iterated = False |
| self._read_before_start = bool(start_position) |
| |
| def next(self): |
| """Returns the next input from as an (offset, line) tuple.""" |
| self._has_iterated = True |
| |
| if self._read_before_start: |
| self._blob_reader.readline() |
| self._read_before_start = False |
| start_position = self._blob_reader.tell() |
| |
| if start_position > self._end_position: |
| raise StopIteration() |
| |
| line = self._blob_reader.readline() |
| |
| if not line: |
| raise StopIteration() |
| |
| return start_position, line.rstrip("\n") |
| |
| def to_json(self): |
| """Returns an json-compatible input shard spec for remaining inputs.""" |
| new_pos = self._blob_reader.tell() |
| if self._has_iterated: |
| new_pos -= 1 |
| return {self.BLOB_KEY_PARAM: self._blob_key, |
| self.INITIAL_POSITION_PARAM: new_pos, |
| self.END_POSITION_PARAM: self._end_position} |
| |
| def __str__(self): |
| """Returns the string representation of this BlobstoreLineInputReader.""" |
| return "blobstore.BlobKey(%r):[%d, %d]" % ( |
| self._blob_key, self._blob_reader.tell(), self._end_position) |
| |
| @classmethod |
| def from_json(cls, json): |
| """Instantiates an instance of this InputReader for the given shard spec.""" |
| return cls(json[cls.BLOB_KEY_PARAM], |
| json[cls.INITIAL_POSITION_PARAM], |
| json[cls.END_POSITION_PARAM]) |
| |
| @classmethod |
| def validate(cls, mapper_spec): |
| """Validates mapper spec and all mapper parameters. |
| |
| Args: |
| mapper_spec: The MapperSpec for this InputReader. |
| |
| Raises: |
| BadReaderParamsError: required parameters are missing or invalid. |
| """ |
| if mapper_spec.input_reader_class() != cls: |
| raise BadReaderParamsError("Mapper input reader class mismatch") |
| params = _get_params(mapper_spec) |
| if cls.BLOB_KEYS_PARAM not in params: |
| raise BadReaderParamsError("Must specify 'blob_keys' for mapper input") |
| blob_keys = params[cls.BLOB_KEYS_PARAM] |
| if isinstance(blob_keys, basestring): |
| |
| |
| blob_keys = blob_keys.split(",") |
| if len(blob_keys) > cls._MAX_BLOB_KEYS_COUNT: |
| raise BadReaderParamsError("Too many 'blob_keys' for mapper input") |
| if not blob_keys: |
| raise BadReaderParamsError("No 'blob_keys' specified for mapper input") |
| for blob_key in blob_keys: |
| blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key)) |
| if not blob_info: |
| raise BadReaderParamsError("Could not find blobinfo for key %s" % |
| blob_key) |
| |
| @classmethod |
| def split_input(cls, mapper_spec): |
| """Returns a list of shard_count input_spec_shards for input_spec. |
| |
| Args: |
| mapper_spec: The mapper specification to split from. Must contain |
| 'blob_keys' parameter with one or more blob keys. |
| |
| Returns: |
| A list of BlobstoreInputReaders corresponding to the specified shards. |
| """ |
| params = _get_params(mapper_spec) |
| blob_keys = params[cls.BLOB_KEYS_PARAM] |
| if isinstance(blob_keys, basestring): |
| |
| |
| blob_keys = blob_keys.split(",") |
| |
| blob_sizes = {} |
| for blob_key in blob_keys: |
| blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key)) |
| blob_sizes[blob_key] = blob_info.size |
| |
| shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count) |
| shards_per_blob = shard_count // len(blob_keys) |
| if shards_per_blob == 0: |
| shards_per_blob = 1 |
| |
| chunks = [] |
| for blob_key, blob_size in blob_sizes.items(): |
| blob_chunk_size = blob_size // shards_per_blob |
| for i in xrange(shards_per_blob - 1): |
| chunks.append(BlobstoreLineInputReader.from_json( |
| {cls.BLOB_KEY_PARAM: blob_key, |
| cls.INITIAL_POSITION_PARAM: blob_chunk_size * i, |
| cls.END_POSITION_PARAM: blob_chunk_size * (i + 1)})) |
| chunks.append(BlobstoreLineInputReader.from_json( |
| {cls.BLOB_KEY_PARAM: blob_key, |
| cls.INITIAL_POSITION_PARAM: blob_chunk_size * (shards_per_blob - 1), |
| cls.END_POSITION_PARAM: blob_size})) |
| return chunks |
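| # Worked example of the chunking above (hypothetical sizes): a single blob of |
| # 1,000,000 bytes with mapper_spec.shard_count=4 gives shards_per_blob=4 and |
| # blob_chunk_size=250000, producing readers over the byte ranges |
| # [0, 250000], [250000, 500000], [500000, 750000] and [750000, 1000000]. |
| # Line boundaries are handled at read time: each reader skips the partial |
| # line before its start position and reads through the line that crosses its |
| # end position. |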
| |
| |
| class BlobstoreZipInputReader(InputReader): |
| """Input reader for files from a zip archive stored in the Blobstore. |
| |
| Each instance of the reader will read the TOC from the end of the zip file, |
| and then only the contained files for which it is responsible. |
| """ |
| |
| |
| _MAX_SHARD_COUNT = 256 |
| |
| |
| BLOB_KEY_PARAM = "blob_key" |
| START_INDEX_PARAM = "start_index" |
| END_INDEX_PARAM = "end_index" |
| |
| def __init__(self, blob_key, start_index, end_index, |
| _reader=blobstore.BlobReader): |
| """Initializes this instance with the given blob key and file range. |
| |
| This BlobstoreZipInputReader will read from the file with index start_index |
| up to but not including the file with index end_index. |
| |
| Args: |
| blob_key: the BlobKey that this input reader is processing. |
| start_index: the index of the first file to read. |
| end_index: the index of the first file that will not be read. |
| _reader: a callable that returns a file-like object for reading blobs. |
| Used for dependency injection. |
| """ |
| self._blob_key = blob_key |
| self._start_index = start_index |
| self._end_index = end_index |
| self._reader = _reader |
| self._zip = None |
| self._entries = None |
| |
| def next(self): |
| """Returns the next input from this input reader as (ZipInfo, opener) tuple. |
| |
| Returns: |
| The next input from this input reader, in the form of a 2-tuple. |
| The first element of the tuple is a zipfile.ZipInfo object. |
| The second element of the tuple is a zero-argument function that, when |
| called, returns the complete body of the file. |
| """ |
| if not self._zip: |
| self._zip = zipfile.ZipFile(self._reader(self._blob_key)) |
| |
| self._entries = self._zip.infolist()[self._start_index:self._end_index] |
| self._entries.reverse() |
| if not self._entries: |
| raise StopIteration() |
| entry = self._entries.pop() |
| self._start_index += 1 |
| return (entry, lambda: self._read(entry)) |
| |
| def _read(self, entry): |
| """Read entry content. |
| |
| Args: |
| entry: zip file entry as zipfile.ZipInfo. |
| Returns: |
| Entry content as string. |
| """ |
| start_time = time.time() |
| content = self._zip.read(entry.filename) |
| |
| ctx = context.get() |
| if ctx: |
| operation.counters.Increment(COUNTER_IO_READ_BYTES, len(content))(ctx) |
| operation.counters.Increment( |
| COUNTER_IO_READ_MSEC, int((time.time() - start_time) * 1000))(ctx) |
| |
| return content |
| |
| @classmethod |
| def from_json(cls, json): |
| """Creates an instance of the InputReader for the given input shard state. |
| |
| Args: |
| json: The InputReader state as a dict-like object. |
| |
| Returns: |
| An instance of the InputReader configured using the values of json. |
| """ |
| return cls(json[cls.BLOB_KEY_PARAM], |
| json[cls.START_INDEX_PARAM], |
| json[cls.END_INDEX_PARAM]) |
| |
| def to_json(self): |
| """Returns an input shard state for the remaining inputs. |
| |
| Returns: |
| A json-izable version of the remaining InputReader. |
| """ |
| return {self.BLOB_KEY_PARAM: self._blob_key, |
| self.START_INDEX_PARAM: self._start_index, |
| self.END_INDEX_PARAM: self._end_index} |
| |
| def __str__(self): |
| """Returns the string representation of this BlobstoreZipInputReader.""" |
| return "blobstore.BlobKey(%r):[%d, %d]" % ( |
| self._blob_key, self._start_index, self._end_index) |
| |
| @classmethod |
| def validate(cls, mapper_spec): |
| """Validates mapper spec and all mapper parameters. |
| |
| Args: |
| mapper_spec: The MapperSpec for this InputReader. |
| |
| Raises: |
| BadReaderParamsError: required parameters are missing or invalid. |
| """ |
| if mapper_spec.input_reader_class() != cls: |
| raise BadReaderParamsError("Mapper input reader class mismatch") |
| params = _get_params(mapper_spec) |
| if cls.BLOB_KEY_PARAM not in params: |
| raise BadReaderParamsError("Must specify 'blob_key' for mapper input") |
| blob_key = params[cls.BLOB_KEY_PARAM] |
| blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key)) |
| if not blob_info: |
| raise BadReaderParamsError("Could not find blobinfo for key %s" % |
| blob_key) |
| |
| @classmethod |
| def split_input(cls, mapper_spec, _reader=blobstore.BlobReader): |
| """Returns a list of input shard states for the input spec. |
| |
| Args: |
| mapper_spec: The MapperSpec for this InputReader. Must contain |
| 'blob_key' parameter with one blob key. |
| _reader: a callable that returns a file-like object for reading blobs. |
| Used for dependency injection. |
| |
| Returns: |
| A list of InputReaders spanning files within the zip. |
| """ |
| params = _get_params(mapper_spec) |
| blob_key = params[cls.BLOB_KEY_PARAM] |
| zip_input = zipfile.ZipFile(_reader(blob_key)) |
| zfiles = zip_input.infolist() |
| total_size = sum(x.file_size for x in zfiles) |
| num_shards = min(mapper_spec.shard_count, cls._MAX_SHARD_COUNT) |
| size_per_shard = total_size // num_shards |
| |
| |
| |
| shard_start_indexes = [0] |
| current_shard_size = 0 |
| for i, fileinfo in enumerate(zfiles): |
| current_shard_size += fileinfo.file_size |
| if current_shard_size >= size_per_shard: |
| shard_start_indexes.append(i + 1) |
| current_shard_size = 0 |
| |
| if shard_start_indexes[-1] != len(zfiles): |
| shard_start_indexes.append(len(zfiles)) |
| |
| return [cls(blob_key, start_index, end_index, _reader) |
| for start_index, end_index |
| in zip(shard_start_indexes, shard_start_indexes[1:])] |
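| # Worked example of the index splitting above (hypothetical archive): four |
| # members with file_size 10, 70, 10 and 10 and num_shards=2 give |
| # size_per_shard=50, so shard_start_indexes becomes [0, 2, 4] and the two |
| # readers cover member indexes [0, 2) and [2, 4). Shards are balanced by |
| # cumulative member size, not by member count. |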
| |
| |
| class BlobstoreZipLineInputReader(InputReader): |
| """Input reader for newline delimited files in zip archives from Blobstore. |
| |
| This has the same external interface as the BlobstoreLineInputReader, in that |
| it takes a list of blobs as its input and yields lines to the reader. |
| However, the blobs themselves are expected to be zip archives of |
| line-delimited files rather than the files themselves. |
| |
| This is useful because many line-delimited files benefit greatly from compression. |
| """ |
| |
| |
| _MAX_SHARD_COUNT = 256 |
| |
| |
| _MAX_BLOB_KEYS_COUNT = 246 |
| |
| |
| BLOB_KEYS_PARAM = "blob_keys" |
| |
| |
| BLOB_KEY_PARAM = "blob_key" |
| START_FILE_INDEX_PARAM = "start_file_index" |
| END_FILE_INDEX_PARAM = "end_file_index" |
| OFFSET_PARAM = "offset" |
| |
| def __init__(self, blob_key, start_file_index, end_file_index, offset, |
| _reader=blobstore.BlobReader): |
| """Initializes this instance with the given blob key and file range. |
| |
| This BlobstoreZipLineInputReader will read from the file with index |
| start_file_index up to but not including the file with index end_file_index. |
| It will return lines starting at offset within file[start_file_index]. |
| |
| Args: |
| blob_key: the BlobKey that this input reader is processing. |
| start_file_index: the index of the first file to read within the zip. |
| end_file_index: the index of the first file that will not be read. |
| offset: the byte offset within blob_key.zip[start_file_index] to start |
| reading. The reader will continue to the end of the file. |
| _reader: a callable that returns a file-like object for reading blobs. |
| Used for dependency injection. |
| """ |
| self._blob_key = blob_key |
| self._start_file_index = start_file_index |
| self._end_file_index = end_file_index |
| self._initial_offset = offset |
| self._reader = _reader |
| self._zip = None |
| self._entries = None |
| self._filestream = None |
| |
| @classmethod |
| def validate(cls, mapper_spec): |
| """Validates mapper spec and all mapper parameters. |
| |
| Args: |
| mapper_spec: The MapperSpec for this InputReader. |
| |
| Raises: |
| BadReaderParamsError: required parameters are missing or invalid. |
| """ |
| if mapper_spec.input_reader_class() != cls: |
| raise BadReaderParamsError("Mapper input reader class mismatch") |
| params = _get_params(mapper_spec) |
| if cls.BLOB_KEYS_PARAM not in params: |
| raise BadReaderParamsError("Must specify 'blob_keys' for mapper input") |
| |
| blob_keys = params[cls.BLOB_KEYS_PARAM] |
| if isinstance(blob_keys, basestring): |
| |
| |
| blob_keys = blob_keys.split(",") |
| if len(blob_keys) > cls._MAX_BLOB_KEYS_COUNT: |
| raise BadReaderParamsError("Too many 'blob_keys' for mapper input") |
| if not blob_keys: |
| raise BadReaderParamsError("No 'blob_keys' specified for mapper input") |
| for blob_key in blob_keys: |
| blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key)) |
| if not blob_info: |
| raise BadReaderParamsError("Could not find blobinfo for key %s" % |
| blob_key) |
| |
| @classmethod |
| def split_input(cls, mapper_spec, _reader=blobstore.BlobReader): |
| """Returns a list of input readers for the input spec. |
| |
| Args: |
| mapper_spec: The MapperSpec for this InputReader. Must contain |
| 'blob_keys' parameter with one or more blob keys. |
| _reader: a callable that returns a file-like object for reading blobs. |
| Used for dependency injection. |
| |
| Returns: |
| A list of InputReaders spanning the subfiles within the blobs. |
| There will be at least one reader per blob, but it will otherwise |
| attempt to keep the expanded size even. |
| """ |
| params = _get_params(mapper_spec) |
| blob_keys = params[cls.BLOB_KEYS_PARAM] |
| if isinstance(blob_keys, basestring): |
| |
| |
| blob_keys = blob_keys.split(",") |
| |
| blob_files = {} |
| total_size = 0 |
| for blob_key in blob_keys: |
| zip_input = zipfile.ZipFile(_reader(blob_key)) |
| blob_files[blob_key] = zip_input.infolist() |
| total_size += sum(x.file_size for x in blob_files[blob_key]) |
| |
| shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count) |
| |
| |
| |
| |
| |
| size_per_shard = total_size // shard_count |
| |
| readers = [] |
| for blob_key in blob_keys: |
| bfiles = blob_files[blob_key] |
| current_shard_size = 0 |
| start_file_index = 0 |
| next_file_index = 0 |
| for fileinfo in bfiles: |
| next_file_index += 1 |
| current_shard_size += fileinfo.file_size |
| if current_shard_size >= size_per_shard: |
| readers.append(cls(blob_key, start_file_index, next_file_index, 0, |
| _reader)) |
| current_shard_size = 0 |
| start_file_index = next_file_index |
| if current_shard_size != 0: |
| readers.append(cls(blob_key, start_file_index, next_file_index, 0, |
| _reader)) |
| |
| return readers |
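
  # Illustrative sketch (not part of the original source): for a single zip
  # blob whose members have uncompressed sizes [10, 40, 30, 20] KB and a
  # requested shard_count of 2, size_per_shard is 50 KB, so the greedy loop
  # above emits two readers covering file index ranges [0, 2) and [2, 4),
  # each starting at byte offset 0 of its first file.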
| |
| def next(self): |
| """Returns the next line from this input reader as (lineinfo, line) tuple. |
| |
| Returns: |
| The next input from this input reader, in the form of a 2-tuple. |
| The first element of the tuple describes the source, it is itself |
| a tuple (blobkey, filenumber, byteoffset). |
| The second element of the tuple is the line found at that offset. |
| """ |
| if not self._filestream: |
| if not self._zip: |
| self._zip = zipfile.ZipFile(self._reader(self._blob_key)) |
| |
| self._entries = self._zip.infolist()[self._start_file_index: |
| self._end_file_index] |
| self._entries.reverse() |
| if not self._entries: |
| raise StopIteration() |
| entry = self._entries.pop() |
| value = self._zip.read(entry.filename) |
| self._filestream = StringIO.StringIO(value) |
| if self._initial_offset: |
| self._filestream.seek(self._initial_offset) |
| self._filestream.readline() |
| |
| start_position = self._filestream.tell() |
| line = self._filestream.readline() |
| |
| if not line: |
| |
| self._filestream.close() |
| self._filestream = None |
| self._start_file_index += 1 |
| self._initial_offset = 0 |
| return self.next() |
| |
| return ((self._blob_key, self._start_file_index, start_position), |
| line.rstrip("\n")) |
| |
| def _next_offset(self): |
| """Return the offset of the next line to read.""" |
| if self._filestream: |
| offset = self._filestream.tell() |
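      # Back up one byte so that, on resume, the seek() + readline() in
      # next() consumes only the previous line's trailing newline instead of
      # discarding the next unread line.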
| if offset: |
| offset -= 1 |
| else: |
| offset = self._initial_offset |
| |
| return offset |
| |
| def to_json(self): |
| """Returns an input shard state for the remaining inputs. |
| |
| Returns: |
| A json-izable version of the remaining InputReader. |
| """ |
| |
| return {self.BLOB_KEY_PARAM: self._blob_key, |
| self.START_FILE_INDEX_PARAM: self._start_file_index, |
| self.END_FILE_INDEX_PARAM: self._end_file_index, |
| self.OFFSET_PARAM: self._next_offset()} |
| |
| @classmethod |
| def from_json(cls, json, _reader=blobstore.BlobReader): |
| """Creates an instance of the InputReader for the given input shard state. |
| |
| Args: |
| json: The InputReader state as a dict-like object. |
| _reader: For dependency injection. |
| |
| Returns: |
| An instance of the InputReader configured using the values of json. |
| """ |
| return cls(json[cls.BLOB_KEY_PARAM], |
| json[cls.START_FILE_INDEX_PARAM], |
| json[cls.END_FILE_INDEX_PARAM], |
| json[cls.OFFSET_PARAM], |
| _reader) |
| |
| def __str__(self): |
| """Returns the string representation of this reader. |
| |
| Returns: |
| string blobkey:[start file num, end file num]:current offset. |
| """ |
| return "blobstore.BlobKey(%r):[%d, %d]:%d" % ( |
| self._blob_key, self._start_file_index, self._end_file_index, |
| self._next_offset()) |
| |
| |
| class RandomStringInputReader(InputReader): |
| """RandomStringInputReader generates random strings as output. |
| |
  Its primary use is to populate outputs with test entries.
| """ |
| |
| |
| COUNT = "count" |
| |
| STRING_LENGTH = "string_length" |
| |
| DEFAULT_STRING_LENGTH = 10 |
| |
| def __init__(self, count, string_length): |
| """Initialize input reader. |
| |
| Args: |
| count: number of entries this shard should generate. |
| string_length: the length of generated random strings. |
| """ |
| self._count = count |
| self._string_length = string_length |
| |
| def __iter__(self): |
| ctx = context.get() |
| |
| while self._count: |
| self._count -= 1 |
| start_time = time.time() |
| content = "".join(random.choice(string.ascii_lowercase) |
| for _ in range(self._string_length)) |
| if ctx: |
| operation.counters.Increment( |
| COUNTER_IO_READ_MSEC, int((time.time() - start_time) * 1000))(ctx) |
| operation.counters.Increment(COUNTER_IO_READ_BYTES, len(content))(ctx) |
| yield content |
| |
| @classmethod |
| def split_input(cls, mapper_spec): |
| params = _get_params(mapper_spec) |
| count = params[cls.COUNT] |
| string_length = cls.DEFAULT_STRING_LENGTH |
| if cls.STRING_LENGTH in params: |
| string_length = params[cls.STRING_LENGTH] |
| |
| shard_count = mapper_spec.shard_count |
| count_per_shard = count // shard_count |
| |
| mr_input_readers = [ |
| cls(count_per_shard, string_length) for _ in range(shard_count)] |
| |
| left = count - count_per_shard*shard_count |
| if left > 0: |
| mr_input_readers.append(cls(left, string_length)) |
| |
| return mr_input_readers |
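
  # Illustrative sketch (not part of the original source): with count=10 and
  # shard_count=3, count_per_shard is 3, so split_input returns three readers
  # producing 3 strings each plus a fourth reader producing the leftover 1,
  # for exactly 10 strings in total.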
| |
| @classmethod |
| def validate(cls, mapper_spec): |
| if mapper_spec.input_reader_class() != cls: |
| raise BadReaderParamsError("Mapper input reader class mismatch") |
| |
| params = _get_params(mapper_spec) |
| if cls.COUNT not in params: |
| raise BadReaderParamsError("Must specify %s" % cls.COUNT) |
| if not isinstance(params[cls.COUNT], int): |
| raise BadReaderParamsError("%s should be an int but is %s" % |
| (cls.COUNT, type(params[cls.COUNT]))) |
| if params[cls.COUNT] <= 0: |
| raise BadReaderParamsError("%s should be a positive int") |
| if cls.STRING_LENGTH in params and not ( |
| isinstance(params[cls.STRING_LENGTH], int) and |
| params[cls.STRING_LENGTH] > 0): |
| raise BadReaderParamsError("%s should be a positive int but is %s" % |
| (cls.STRING_LENGTH, params[cls.STRING_LENGTH])) |
| if (not isinstance(mapper_spec.shard_count, int) or |
| mapper_spec.shard_count <= 0): |
| raise BadReaderParamsError( |
| "shard_count should be a positive int but is %s" % |
| mapper_spec.shard_count) |
| |
| @classmethod |
| def from_json(cls, json): |
| return cls(json[cls.COUNT], json[cls.STRING_LENGTH]) |
| |
| def to_json(self): |
| return {self.COUNT: self._count, self.STRING_LENGTH: self._string_length} |
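
  # Configuration sketch (hypothetical values): the mapper's input_reader
  # parameters for this reader would look roughly like
  #     {"count": 1000, "string_length": 12}
  # where "count" is required and "string_length" falls back to
  # DEFAULT_STRING_LENGTH (10) when omitted.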
| |
| |
| |
| |
| |
| |
| |
| |
| class NamespaceInputReader(InputReader): |
| """An input reader to iterate over namespaces. |
| |
  This reader yields namespace names as strings. The input is split into
  contiguous namespace ranges, with at most one range per shard.
| """ |
| |
| NAMESPACE_RANGE_PARAM = "namespace_range" |
| BATCH_SIZE_PARAM = "batch_size" |
| _BATCH_SIZE = 10 |
| |
| def __init__(self, ns_range, batch_size=_BATCH_SIZE): |
| self.ns_range = ns_range |
| self._batch_size = batch_size |
| |
| def to_json(self): |
| """Serializes all the data in this query range into json form. |
| |
| Returns: |
| all the data in json-compatible map. |
| """ |
| return {self.NAMESPACE_RANGE_PARAM: self.ns_range.to_json_object(), |
| self.BATCH_SIZE_PARAM: self._batch_size} |
| |
| @classmethod |
| def from_json(cls, json): |
| """Create new DatastoreInputReader from the json, encoded by to_json. |
| |
| Args: |
      json: json map representation of NamespaceInputReader.
| |
| Returns: |
      an instance of NamespaceInputReader with all data deserialized from json.
| """ |
| return cls( |
| namespace_range.NamespaceRange.from_json_object( |
| json[cls.NAMESPACE_RANGE_PARAM]), |
| json[cls.BATCH_SIZE_PARAM]) |
| |
| @classmethod |
| def validate(cls, mapper_spec): |
| """Validates mapper spec. |
| |
| Args: |
| mapper_spec: The MapperSpec for this InputReader. |
| |
| Raises: |
| BadReaderParamsError: required parameters are missing or invalid. |
| """ |
| if mapper_spec.input_reader_class() != cls: |
| raise BadReaderParamsError("Input reader class mismatch") |
| params = _get_params(mapper_spec) |
| if cls.BATCH_SIZE_PARAM in params: |
| try: |
| batch_size = int(params[cls.BATCH_SIZE_PARAM]) |
| if batch_size < 1: |
| raise BadReaderParamsError("Bad batch size: %s" % batch_size) |
| except ValueError, e: |
| raise BadReaderParamsError("Bad batch size: %s" % e) |
| |
| @classmethod |
| def split_input(cls, mapper_spec): |
| """Returns a list of input readers for the input spec. |
| |
| Args: |
| mapper_spec: The MapperSpec for this InputReader. |
| |
| Returns: |
| A list of InputReaders. |
| """ |
| batch_size = int(_get_params(mapper_spec).get( |
| cls.BATCH_SIZE_PARAM, cls._BATCH_SIZE)) |
| shard_count = mapper_spec.shard_count |
| namespace_ranges = namespace_range.NamespaceRange.split(shard_count, |
| contiguous=True) |
| return [NamespaceInputReader(ns_range, batch_size) |
| for ns_range in namespace_ranges] |
| |
| def __iter__(self): |
| while True: |
| keys = self.ns_range.make_datastore_query().Get(limit=self._batch_size) |
| if not keys: |
| break |
| |
| for key in keys: |
| namespace = metadata.Namespace.key_to_namespace(key) |
| self.ns_range = self.ns_range.with_start_after(namespace) |
| yield namespace |
| |
| def __str__(self): |
| return repr(self.ns_range) |
| |
| |
| class RecordsReader(InputReader): |
| """Reader to read a list of Files API file in records format. |
| |
| The number of input shards can be specified by the SHARDS_PARAM |
| mapper parameter. Input files cannot be split, so there will be at most |
  one shard per file. Also, the number of shards will not be reduced based on
  the number of input files, so the number of shards in always equals the
  number of shards out.
| """ |
| |
| FILE_PARAM = "file" |
| FILES_PARAM = "files" |
| |
| def __init__(self, filenames, position): |
| """Constructor. |
| |
| Args: |
| filenames: list of filenames. |
| position: file position to start reading from as int. |
| """ |
| self._filenames = filenames |
| if self._filenames: |
| self._reader = records.RecordsReader( |
| files.BufferedFile(self._filenames[0])) |
| self._reader.seek(position) |
| else: |
| self._reader = None |
| |
| def __iter__(self): |
| """Iterate over records in file. |
| |
| Yields: |
| Records as strings. |
| """ |
| ctx = context.get() |
| |
| while self._reader: |
| try: |
| start_time = time.time() |
| record = self._reader.read() |
| if ctx: |
| operation.counters.Increment( |
| COUNTER_IO_READ_MSEC, int((time.time() - start_time) * 1000))(ctx) |
| operation.counters.Increment(COUNTER_IO_READ_BYTES, len(record))(ctx) |
| yield record |
| except (files.ExistenceError), e: |
| raise errors.FailJobError("ExistenceError: %s" % e) |
| except (files.UnknownError), e: |
| raise errors.RetrySliceError("UnknownError: %s" % e) |
| except EOFError: |
| self._filenames.pop(0) |
| if not self._filenames: |
| self._reader = None |
| else: |
| self._reader = records.RecordsReader( |
| files.BufferedFile(self._filenames[0])) |
| |
| @classmethod |
| def from_json(cls, json): |
| """Creates an instance of the InputReader for the given input shard state. |
| |
| Args: |
| json: The InputReader state as a dict-like object. |
| |
| Returns: |
| An instance of the InputReader configured using the values of json. |
| """ |
| return cls(json["filenames"], json["position"]) |
| |
| def to_json(self): |
| """Returns an input shard state for the remaining inputs. |
| |
| Returns: |
| A json-izable version of the remaining InputReader. |
| """ |
| result = { |
| "filenames": self._filenames, |
| "position": 0, |
| } |
| if self._reader: |
| result["position"] = self._reader.tell() |
| return result |
| |
| @classmethod |
| def split_input(cls, mapper_spec): |
| """Returns a list of input readers for the input spec. |
| |
| Args: |
| mapper_spec: The MapperSpec for this InputReader. |
| |
| Returns: |
| A list of InputReaders. |
| """ |
| params = _get_params(mapper_spec) |
| shard_count = mapper_spec.shard_count |
| |
| if cls.FILES_PARAM in params: |
| filenames = params[cls.FILES_PARAM] |
| if isinstance(filenames, basestring): |
| filenames = filenames.split(",") |
| else: |
| filenames = [params[cls.FILE_PARAM]] |
| |
| batch_list = [[] for _ in xrange(shard_count)] |
| for index, _ in enumerate(filenames): |
| |
| batch_list[index % shard_count].append(filenames[index]) |
| |
| |
| batch_list.sort(reverse=True, key=len) |
| return [cls(batch, 0) for batch in batch_list] |
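
  # Illustrative sketch (not part of the original source): with filenames
  # [f0, f1, f2, f3, f4] and shard_count=3, the round-robin assignment above
  # produces batches [f0, f3], [f1, f4] and [f2]; sorting by length
  # (descending) keeps any empty batches at the end of the returned list.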
| |
| @classmethod |
| def validate(cls, mapper_spec): |
| """Validates mapper spec and all mapper parameters. |
| |
| Args: |
| mapper_spec: The MapperSpec for this InputReader. |
| |
| Raises: |
| BadReaderParamsError: required parameters are missing or invalid. |
| """ |
| if mapper_spec.input_reader_class() != cls: |
| raise errors.BadReaderParamsError("Input reader class mismatch") |
| params = _get_params(mapper_spec) |
| if (cls.FILES_PARAM not in params and |
| cls.FILE_PARAM not in params): |
| raise BadReaderParamsError( |
| "Must specify '%s' or '%s' parameter for mapper input" % |
| (cls.FILES_PARAM, cls.FILE_PARAM)) |
| |
| def __str__(self): |
| position = 0 |
| if self._reader: |
| position = self._reader.tell() |
| return "%s:%s" % (self._filenames, position) |
| |
| |
| class LogInputReader(InputReader): |
| """Input reader for a time range of logs via the Logs Reader API. |
| |
| The number of input shards may be specified by the SHARDS_PARAM mapper |
| parameter. A starting and ending time (in seconds since the Unix epoch) are |
| required to generate time ranges over which to shard the input. |
| """ |
| |
| START_TIME_PARAM = "start_time" |
| END_TIME_PARAM = "end_time" |
| MINIMUM_LOG_LEVEL_PARAM = "minimum_log_level" |
| INCLUDE_INCOMPLETE_PARAM = "include_incomplete" |
| INCLUDE_APP_LOGS_PARAM = "include_app_logs" |
| VERSION_IDS_PARAM = "version_ids" |
| MODULE_VERSIONS_PARAM = "module_versions" |
| |
| |
| _OFFSET_PARAM = "offset" |
| _PROTOTYPE_REQUEST_PARAM = "prototype_request" |
| |
| _PARAMS = frozenset([START_TIME_PARAM, END_TIME_PARAM, _OFFSET_PARAM, |
| MINIMUM_LOG_LEVEL_PARAM, INCLUDE_INCOMPLETE_PARAM, |
| INCLUDE_APP_LOGS_PARAM, VERSION_IDS_PARAM, |
| MODULE_VERSIONS_PARAM, _PROTOTYPE_REQUEST_PARAM]) |
| _KWARGS = frozenset([_OFFSET_PARAM, _PROTOTYPE_REQUEST_PARAM]) |
| |
| def __init__(self, |
| start_time=None, |
| end_time=None, |
| minimum_log_level=None, |
| include_incomplete=False, |
| include_app_logs=False, |
| version_ids=None, |
| module_versions=None, |
| **kwargs): |
| """Constructor. |
| |
| Args: |
| start_time: The earliest request completion or last-update time of logs |
| that should be mapped over, in seconds since the Unix epoch. |
| end_time: The latest request completion or last-update time that logs |
| should be mapped over, in seconds since the Unix epoch. |
| minimum_log_level: An application log level which serves as a filter on |
| the requests mapped over--requests with no application log at or above |
| the specified level will be omitted, even if include_app_logs is False. |
| include_incomplete: Whether or not to include requests that have started |
| but not yet finished, as a boolean. Defaults to False. |
| include_app_logs: Whether or not to include application level logs in the |
| mapped logs, as a boolean. Defaults to False. |
      version_ids: A list of version ids whose logs should be read. This
        cannot be used together with module_versions.
      module_versions: A list of tuples containing a module and version id
        whose logs should be read. This cannot be used together with
        version_ids.
| **kwargs: A dictionary of keywords associated with this input reader. |
| """ |
| InputReader.__init__(self) |
| |
| |
| |
| self.__params = dict(kwargs) |
| |
| if start_time is not None: |
| self.__params[self.START_TIME_PARAM] = start_time |
| if end_time is not None: |
| self.__params[self.END_TIME_PARAM] = end_time |
| if minimum_log_level is not None: |
| self.__params[self.MINIMUM_LOG_LEVEL_PARAM] = minimum_log_level |
| if include_incomplete is not None: |
| self.__params[self.INCLUDE_INCOMPLETE_PARAM] = include_incomplete |
| if include_app_logs is not None: |
| self.__params[self.INCLUDE_APP_LOGS_PARAM] = include_app_logs |
| if version_ids: |
| self.__params[self.VERSION_IDS_PARAM] = version_ids |
| if module_versions: |
| self.__params[self.MODULE_VERSIONS_PARAM] = module_versions |
| |
| |
| if self._PROTOTYPE_REQUEST_PARAM in self.__params: |
| prototype_request = log_service_pb.LogReadRequest( |
| self.__params[self._PROTOTYPE_REQUEST_PARAM]) |
| self.__params[self._PROTOTYPE_REQUEST_PARAM] = prototype_request |
| |
| def __iter__(self): |
| """Iterates over logs in a given range of time. |
| |
| Yields: |
| A RequestLog containing all the information for a single request. |
| """ |
| for log in logservice.fetch(**self.__params): |
| self.__params[self._OFFSET_PARAM] = log.offset |
| yield log |
| |
| @classmethod |
| def from_json(cls, json): |
| """Creates an instance of the InputReader for the given input shard's state. |
| |
| Args: |
| json: The InputReader state as a dict-like object. |
| |
| Returns: |
| An instance of the InputReader configured using the given JSON parameters. |
| """ |
| |
| params = dict((str(k), v) for k, v in json.iteritems() |
| if k in cls._PARAMS) |
| |
| |
| |
| |
| if cls._OFFSET_PARAM in params: |
| params[cls._OFFSET_PARAM] = base64.b64decode(params[cls._OFFSET_PARAM]) |
| return cls(**params) |
| |
| def to_json(self): |
| """Returns an input shard state for the remaining inputs. |
| |
| Returns: |
| A JSON serializable version of the remaining input to read. |
| """ |
| |
| params = dict(self.__params) |
| if self._PROTOTYPE_REQUEST_PARAM in params: |
| prototype_request = params[self._PROTOTYPE_REQUEST_PARAM] |
| params[self._PROTOTYPE_REQUEST_PARAM] = prototype_request.Encode() |
| if self._OFFSET_PARAM in params: |
| params[self._OFFSET_PARAM] = base64.b64encode(params[self._OFFSET_PARAM]) |
| return params |
| |
| @classmethod |
| def split_input(cls, mapper_spec): |
| """Returns a list of input readers for the given input specification. |
| |
| Args: |
| mapper_spec: The MapperSpec for this InputReader. |
| |
| Returns: |
| A list of InputReaders. |
| """ |
| params = _get_params(mapper_spec) |
| shard_count = mapper_spec.shard_count |
| |
| |
| start_time = params[cls.START_TIME_PARAM] |
| end_time = params[cls.END_TIME_PARAM] |
| seconds_per_shard = (end_time - start_time) / shard_count |
| |
| |
| shards = [] |
| for _ in xrange(shard_count - 1): |
| params[cls.END_TIME_PARAM] = (params[cls.START_TIME_PARAM] + |
| seconds_per_shard) |
| shards.append(LogInputReader(**params)) |
| params[cls.START_TIME_PARAM] = params[cls.END_TIME_PARAM] |
| |
| |
| params[cls.END_TIME_PARAM] = end_time |
| return shards + [LogInputReader(**params)] |
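
  # Illustrative sketch (not part of the original source): with start_time=0,
  # end_time=100 and shard_count=4, seconds_per_shard is 25, so readers are
  # created over 0-25, 25-50, 50-75 and 75-100; the last reader always ends
  # at the caller-supplied end_time.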
| |
| @classmethod |
| def validate(cls, mapper_spec): |
| """Validates the mapper's specification and all necessary parameters. |
| |
| Args: |
| mapper_spec: The MapperSpec to be used with this InputReader. |
| |
| Raises: |
| BadReaderParamsError: If the user fails to specify both a starting time |
| and an ending time, or if the starting time is later than the ending |
| time. |
| """ |
| if mapper_spec.input_reader_class() != cls: |
| raise errors.BadReaderParamsError("Input reader class mismatch") |
| |
| params = _get_params(mapper_spec, allowed_keys=cls._PARAMS) |
| if (cls.VERSION_IDS_PARAM not in params and |
| cls.MODULE_VERSIONS_PARAM not in params): |
| raise errors.BadReaderParamsError("Must specify a list of version ids or " |
| "module/version ids for mapper input") |
| if (cls.VERSION_IDS_PARAM in params and |
| cls.MODULE_VERSIONS_PARAM in params): |
| raise errors.BadReaderParamsError("Can not supply both version ids or " |
| "module/version ids. Use only one.") |
| if (cls.START_TIME_PARAM not in params or |
| params[cls.START_TIME_PARAM] is None): |
| raise errors.BadReaderParamsError("Must specify a starting time for " |
| "mapper input") |
| if cls.END_TIME_PARAM not in params or params[cls.END_TIME_PARAM] is None: |
| params[cls.END_TIME_PARAM] = time.time() |
| |
| if params[cls.START_TIME_PARAM] >= params[cls.END_TIME_PARAM]: |
| raise errors.BadReaderParamsError("The starting time cannot be later " |
| "than or the same as the ending time.") |
| |
| if cls._PROTOTYPE_REQUEST_PARAM in params: |
| try: |
| params[cls._PROTOTYPE_REQUEST_PARAM] = log_service_pb.LogReadRequest( |
| params[cls._PROTOTYPE_REQUEST_PARAM]) |
| except (TypeError, ProtocolBuffer.ProtocolBufferDecodeError): |
| raise errors.BadReaderParamsError("The prototype request must be " |
| "parseable as a LogReadRequest.") |
| |
| |
| |
| |
| try: |
| logservice.fetch(**params) |
| except logservice.InvalidArgumentError, e: |
| raise errors.BadReaderParamsError("One or more parameters are not valid " |
| "inputs to logservice.fetch(): %s" % e) |
| |
| def __str__(self): |
| """Returns the string representation of this LogInputReader.""" |
| params = [] |
| for key in sorted(self.__params.keys()): |
| value = self.__params[key] |
      if key == self._PROTOTYPE_REQUEST_PARAM:
        params.append("%s='%s'" % (key, value))
      elif key == self._OFFSET_PARAM:
        params.append("%s='%s'" % (key, value))
| else: |
| params.append("%s=%s" % (key, value)) |
| |
| return "LogInputReader(%s)" % ", ".join(params) |
| |
| |
| class _GoogleCloudStorageInputReader(InputReader): |
| """Input reader from Google Cloud Storage using the cloudstorage library. |
| |
| This class is expected to be subclassed with a reader that understands |
| user-level records. |
| |
| Required configuration in the mapper_spec.input_reader dictionary. |
| BUCKET_NAME_PARAM: name of the bucket to use (with no extra delimiters or |
    suffixes such as directories).
| OBJECT_NAMES_PARAM: a list of object names or prefixes. All objects must be |
| in the BUCKET_NAME_PARAM bucket. If the name ends with a * it will be |
| treated as prefix and all objects with matching names will be read. |
| Entries should not start with a slash unless that is part of the object's |
| name. An example list could be: |
| ["my-1st-input-file", "directory/my-2nd-file", "some/other/dir/input-*"] |
    To retrieve all files, "*" will match every object in the bucket. If a
    file is listed twice or is covered by multiple prefixes, it will be read
    twice; there is no deduplication.
| |
  Optional configuration in the mapper_spec.input_reader dictionary.
| BUFFER_SIZE_PARAM: the size of the read buffer for each file handle. |
| DELIMITER_PARAM: if specified, turn on the shallow splitting mode. |
| The delimiter is used as a path separator to designate directory |
      hierarchy. Matching of prefixes from OBJECT_NAMES_PARAM will stop at
      the first directory instead of matching all files under that
      directory. This allows MapReduce to process buckets with hundreds of
      thousands of files.
| """ |
| |
| |
| BUCKET_NAME_PARAM = "bucket_name" |
| OBJECT_NAMES_PARAM = "objects" |
| BUFFER_SIZE_PARAM = "buffer_size" |
| DELIMITER_PARAM = "delimiter" |
| |
| |
| _ACCOUNT_ID_PARAM = "account_id" |
| |
| |
| _JSON_PICKLE = "pickle" |
| _STRING_MAX_FILES_LISTED = 10 |
| |
| |
| |
| |
| |
| |
| |
| def __init__(self, filenames, index=0, buffer_size=None, _account_id=None, |
| delimiter=None): |
| """Initialize a GoogleCloudStorageInputReader instance. |
| |
| Args: |
| filenames: A list of Google Cloud Storage filenames of the form |
| '/bucket/objectname'. |
| index: Index of the next filename to read. |
| buffer_size: The size of the read buffer, None to use default. |
| _account_id: Internal use only. See cloudstorage documentation. |
| delimiter: Delimiter used as path separator. See class doc. |
| """ |
| self._filenames = filenames |
| self._index = index |
| self._buffer_size = buffer_size |
| self._account_id = _account_id |
| self._delimiter = delimiter |
| self._bucket = None |
| self._bucket_iter = None |
| |
| def _next_file(self): |
| """Find next filename. |
| |
| self._filenames may need to be expanded via listbucket. |
| |
| Returns: |
      None if no more files are left. The next filename otherwise.
| """ |
| while True: |
| if self._bucket_iter: |
| try: |
| return self._bucket_iter.next().filename |
| except StopIteration: |
| self._bucket_iter = None |
| self._bucket = None |
| if self._index >= len(self._filenames): |
| return |
| filename = self._filenames[self._index] |
| self._index += 1 |
| if self._delimiter is None or not filename.endswith(self._delimiter): |
| return filename |
| self._bucket = cloudstorage.listbucket(filename, |
| delimiter=self._delimiter) |
| self._bucket_iter = iter(self._bucket) |
| |
| @classmethod |
| def validate(cls, mapper_spec): |
| """Validate mapper specification. |
| |
| Args: |
| mapper_spec: an instance of model.MapperSpec |
| |
| Raises: |
| BadReaderParamsError: if the specification is invalid for any reason such |
| as missing the bucket name or providing an invalid bucket name. |
| """ |
| reader_spec = _get_params(mapper_spec, allow_old=False) |
| |
| |
| if cls.BUCKET_NAME_PARAM not in reader_spec: |
| raise errors.BadReaderParamsError( |
| "%s is required for Google Cloud Storage" % |
| cls.BUCKET_NAME_PARAM) |
| try: |
| cloudstorage.validate_bucket_name( |
| reader_spec[cls.BUCKET_NAME_PARAM]) |
| except ValueError, error: |
| raise errors.BadReaderParamsError("Bad bucket name, %s" % (error)) |
| |
| |
| if cls.OBJECT_NAMES_PARAM not in reader_spec: |
| raise errors.BadReaderParamsError( |
| "%s is required for Google Cloud Storage" % |
| cls.OBJECT_NAMES_PARAM) |
| filenames = reader_spec[cls.OBJECT_NAMES_PARAM] |
| if not isinstance(filenames, list): |
| raise errors.BadReaderParamsError( |
| "Object name list is not a list but a %s" % |
| filenames.__class__.__name__) |
| for filename in filenames: |
| if not isinstance(filename, basestring): |
| raise errors.BadReaderParamsError( |
| "Object name is not a string but a %s" % |
| filename.__class__.__name__) |
| if cls.DELIMITER_PARAM in reader_spec: |
| delimiter = reader_spec[cls.DELIMITER_PARAM] |
| if not isinstance(delimiter, str): |
| raise errors.BadReaderParamsError( |
| "%s is not a string but a %s" % |
| (cls.DELIMITER_PARAM, type(delimiter))) |
| |
| @classmethod |
| def split_input(cls, mapper_spec): |
| """Returns a list of input readers. |
| |
| An equal number of input files are assigned to each shard (+/- 1). If there |
| are fewer files than shards, fewer than the requested number of shards will |
| be used. Input files are currently never split (although for some formats |
    they could be, and may be split in a future implementation).
| |
| Args: |
| mapper_spec: an instance of model.MapperSpec. |
| |
| Returns: |
| A list of InputReaders. None when no input data can be found. |
| """ |
| reader_spec = _get_params(mapper_spec, allow_old=False) |
| bucket = reader_spec[cls.BUCKET_NAME_PARAM] |
| filenames = reader_spec[cls.OBJECT_NAMES_PARAM] |
| delimiter = reader_spec.get(cls.DELIMITER_PARAM) |
| account_id = reader_spec.get(cls._ACCOUNT_ID_PARAM) |
| buffer_size = reader_spec.get(cls.BUFFER_SIZE_PARAM) |
| |
| |
| all_filenames = [] |
| for filename in filenames: |
| if filename.endswith("*"): |
| all_filenames.extend( |
| [file_stat.filename for file_stat in cloudstorage.listbucket( |
| "/" + bucket + "/" + filename[:-1], delimiter=delimiter, |
| _account_id=account_id)]) |
| else: |
| all_filenames.append("/%s/%s" % (bucket, filename)) |
| |
| |
| readers = [] |
| for shard in range(0, mapper_spec.shard_count): |
| shard_filenames = all_filenames[shard::mapper_spec.shard_count] |
| if shard_filenames: |
| readers.append(cls( |
| shard_filenames, buffer_size=buffer_size, _account_id=account_id, |
| delimiter=delimiter)) |
| return readers |
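
  # Illustrative sketch (not part of the original source): with six expanded
  # filenames [f0..f5] and shard_count=3, the striding above assigns
  # [f0, f3], [f1, f4] and [f2, f5] to three readers; with only two
  # filenames, only two readers are returned.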
| |
| @classmethod |
| def from_json(cls, state): |
| obj = pickle.loads(state[cls._JSON_PICKLE]) |
| if obj._bucket: |
| obj._bucket_iter = iter(obj._bucket) |
| return obj |
| |
| def to_json(self): |
| self._bucket_iter = None |
| return {self._JSON_PICKLE: pickle.dumps(self)} |
| |
| def next(self): |
| """Returns the next input from this input reader, a block of bytes. |
| |
    Non-existent files will be logged and skipped. The file might have been
| removed after input splitting. |
| |
| Returns: |
| The next input from this input reader in the form of a cloudstorage |
| ReadBuffer that supports a File-like interface (read, readline, seek, |
      tell, and close). An error may be raised if the file cannot be opened.
| |
| Raises: |
| StopIteration: The list of files has been exhausted. |
| """ |
| options = {} |
| if self._buffer_size: |
| options["read_buffer_size"] = self._buffer_size |
| if self._account_id: |
| options["_account_id"] = self._account_id |
| while True: |
| filename = self._next_file() |
| if filename is None: |
| raise StopIteration() |
| try: |
| handle = cloudstorage.open(filename, **options) |
| return handle |
| except cloudstorage.NotFoundError: |
| logging.warning("File %s may have been removed. Skipping file.", |
| filename) |
| |
| def __str__(self): |
| |
| num_files = len(self._filenames) |
| if num_files > self._STRING_MAX_FILES_LISTED: |
| names = "%s...%s + %d not shown" % ( |
| ",".join(self._filenames[0:self._STRING_MAX_FILES_LISTED-1]), |
| self._filenames[-1], |
| num_files - self._STRING_MAX_FILES_LISTED) |
| else: |
| names = ",".join(self._filenames) |
| |
    if self._index >= num_files:
| status = "EOF" |
| else: |
| status = "Next %s (%d of %d)" % ( |
| self._filenames[self._index], |
| self._index + 1, |
| num_files) |
| return "CloudStorage [%s, %s]" % (status, names) |
| |
| |
| class _GoogleCloudStorageRecordInputReader(_GoogleCloudStorageInputReader): |
| """Read data from a Google Cloud Storage file using LevelDB format. |
| |
| See the _GoogleCloudStorageOutputWriter for additional configuration options. |
| """ |
| |
| def __getstate__(self): |
| result = self.__dict__.copy() |
| |
| if "_record_reader" in result: |
| |
| |
| result.pop("_record_reader") |
| return result |
| |
| def next(self): |
| """Returns the next input from this input reader, a record. |
| |
| Returns: |
| The next input from this input reader in the form of a record read from |
      a LevelDB file.
| |
| Raises: |
      StopIteration: The ordered set of records has been exhausted.
| """ |
| while True: |
| if not hasattr(self, "_cur_handle") or self._cur_handle is None: |
| |
| self._cur_handle = super(_GoogleCloudStorageRecordInputReader, |
| self).next() |
| if not hasattr(self, "_record_reader") or self._record_reader is None: |
| self._record_reader = records.RecordsReader(self._cur_handle) |
| |
| try: |
| return self._record_reader.read() |
| except EOFError: |
| self._cur_handle = None |
| self._record_reader = None |
| |
| |
| |
| class _ReducerReader(RecordsReader): |
| """Reader to read KeyValues records files from Files API.""" |
| |
| expand_parameters = True |
| |
| def __init__(self, filenames, position): |
| super(_ReducerReader, self).__init__(filenames, position) |
| self.current_key = None |
| self.current_values = None |
| |
| def __iter__(self): |
| ctx = context.get() |
| combiner = None |
| |
| if ctx: |
| combiner_spec = ctx.mapreduce_spec.mapper.params.get("combiner_spec") |
| if combiner_spec: |
| combiner = util.handler_for_name(combiner_spec) |
| |
| for binary_record in super(_ReducerReader, self).__iter__(): |
| proto = file_service_pb.KeyValues() |
| proto.ParseFromString(binary_record) |
| |
| to_yield = None |
| if self.current_key is not None and self.current_key != proto.key(): |
| to_yield = (self.current_key, self.current_values) |
| self.current_key = None |
| self.current_values = None |
| |
| if self.current_key is None: |
| self.current_key = proto.key() |
| self.current_values = [] |
| |
| if combiner: |
| combiner_result = combiner( |
| self.current_key, proto.value_list(), self.current_values) |
| |
| if not util.is_generator(combiner_result): |
| raise errors.BadCombinerOutputError( |
| "Combiner %s should yield values instead of returning them (%s)" % |
| (combiner, combiner_result)) |
| |
| self.current_values = [] |
| for value in combiner_result: |
| if isinstance(value, operation.Operation): |
| value(ctx) |
| else: |
| |
| self.current_values.append(value) |
| |
| |
| |
| |
| if not to_yield: |
| yield ALLOW_CHECKPOINT |
| else: |
| |
| self.current_values.extend(proto.value_list()) |
| |
| if to_yield: |
| yield to_yield |
| |
| yield ALLOW_CHECKPOINT |
| |
| |
| |
| if self.current_key is not None: |
| to_yield = (self.current_key, self.current_values) |
| self.current_key = None |
| self.current_values = None |
| yield to_yield |
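
  # Illustrative sketch (not part of the original source): without a
  # combiner, given KeyValues records keyed k1, k1, k2 (already sorted), the
  # loop above accumulates both k1 records, yields (k1, [values...]) followed
  # by ALLOW_CHECKPOINT when k2 is first seen, and yields (k2, [values...])
  # once the input is exhausted.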
| |
| @staticmethod |
| def encode_data(data): |
| """Encodes the given data, which may have include raw bytes. |
| |
| Works around limitations in JSON encoding, which cannot handle raw bytes. |
| |
| Args: |
| data: the data to encode. |
| |
| Returns: |
| The data encoded. |
| """ |
| return base64.b64encode(pickle.dumps(data)) |
| |
| @staticmethod |
| def decode_data(data): |
| """Decodes data encoded with the encode_data function.""" |
| return pickle.loads(base64.b64decode(data)) |
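
  # Round-trip sketch: decode_data(encode_data(obj)) returns an equal copy of
  # obj for ordinary picklable values (strings, lists, None), which is how
  # current_key and current_values survive JSON serialization below.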
| |
| def to_json(self): |
| """Returns an input shard state for the remaining inputs. |
| |
| Returns: |
| A json-izable version of the remaining InputReader. |
| """ |
| result = super(_ReducerReader, self).to_json() |
| result["current_key"] = self.encode_data(self.current_key) |
| result["current_values"] = self.encode_data(self.current_values) |
| return result |
| |
| @classmethod |
| def from_json(cls, json): |
| """Creates an instance of the InputReader for the given input shard state. |
| |
| Args: |
| json: The InputReader state as a dict-like object. |
| |
| Returns: |
| An instance of the InputReader configured using the values of json. |
| """ |
| result = super(_ReducerReader, cls).from_json(json) |
| result.current_key = _ReducerReader.decode_data(json["current_key"]) |
| result.current_values = _ReducerReader.decode_data(json["current_values"]) |
| return result |