pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 1 | // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
Pavel Kalinnikov | d797063 | 2017-06-20 09:07:34 | [diff] [blame] | 5 | #ifndef COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_ |
| 6 | #define COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_ |
pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 7 | |
| 8 | #include <stddef.h> |
| 9 | |
| 10 | #include <iterator> |
| 11 | #include <type_traits> |
| 12 | |
| 13 | #include "base/logging.h" |
| 14 | #include "base/strings/string_piece.h" |
| 15 | |
Pavel Kalinnikov | d797063 | 2017-06-20 09:07:34 | [diff] [blame] | 16 | namespace url_pattern_index { |
pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 17 | |
| 18 | // The class used to iteratively extract N-grams from strings. An N-gram is a |
| 19 | // string consisting of N (up to 8) non-special characters, which are stored in |
| 20 | // the lowest N non-zero bytes, lower bytes corresponding to later symbols. The |
| 21 | // size of the integer type limits the maximum value of N. For example an |
| 22 | // uint64_t can store up to 8-grams. |
| 23 | // |
| 24 | // Note: If used for UTF-8 strings, the N-grams can have partial byte sequences. |
| 25 | // |
| 26 | // Template parameters: |
| 27 | // * N - the size of N-grams. |
| 28 | // * NGramType - the integer type used to encode N-grams. |
| 29 | // * IsSeparator - the type of a bool(char) functor. |
| 30 | template <size_t N, typename NGramType, typename IsSeparator> |
| 31 | class NGramExtractor { |
| 32 | public: |
| 33 | // An STL compatible input iterator over N-grams contained in a string. |
| 34 | class Iterator : public std::iterator<std::input_iterator_tag, NGramType> { |
| 35 | public: |
| 36 | // Creates an iterator, which points to the leftmost valid N-gram within the |
| 37 | // |extractor|'s string, starting from |head|. |
| 38 | Iterator(const NGramExtractor& extractor, |
| 39 | base::StringPiece::const_iterator head) |
| 40 | : extractor_(extractor), head_(head), end_(extractor.string_.end()) { |
| 41 | DCHECK_GE(head, extractor_.string_.begin()); |
| 42 | DCHECK_LE(head, end_); |
| 43 | |
| 44 | CompleteNGramFrom(0); |
| 45 | } |
| 46 | |
| 47 | bool operator==(const Iterator& rhs) const { return head_ == rhs.head_; } |
| 48 | bool operator!=(const Iterator& rhs) const { return !operator==(rhs); } |
| 49 | |
| 50 | NGramType operator*() const { return ngram_; } |
| 51 | NGramType* operator->() const { return &ngram_; } |
| 52 | |
| 53 | Iterator& operator++() { |
| 54 | ngram_ &= ~(static_cast<NGramType>(0xFFu) << 8 * (N - 1)); |
| 55 | ++head_; |
| 56 | CompleteNGramFrom(N - 1); |
| 57 | return *this; |
| 58 | } |
| 59 | |
| 60 | Iterator operator++(int) { |
| 61 | Iterator copy(*this); |
| 62 | operator++(); |
| 63 | return copy; |
| 64 | } |
| 65 | |
| 66 | private: |
| 67 | // Consumes characters starting with the one pointed to by |head_|, as many |
| 68 | // of them as needed to extend |ngram_| from its |current_length| to a |
| 69 | // length of N. Leaves |head_| pointing to the last character consumed. |
| 70 | void CompleteNGramFrom(size_t current_length) { |
| 71 | for (; head_ != end_; ++head_) { |
| 72 | if (extractor_.is_separator_(*head_)) { |
| 73 | current_length = 0; |
| 74 | ngram_ = 0; |
| 75 | } else { |
| 76 | ngram_ = ngram_ << 8 | static_cast<NGramType>(*head_); |
| 77 | if (++current_length == N) |
| 78 | break; |
| 79 | } |
| 80 | } |
| 81 | } |
| 82 | |
| 83 | const NGramExtractor& extractor_; |
| 84 | |
| 85 | // Always points to the last character included in the current |ngram_|. |
| 86 | base::StringPiece::const_iterator head_; |
| 87 | // Always points to extractor_.string_.end(). |
| 88 | base::StringPiece::const_iterator end_; |
| 89 | |
| 90 | // Contains the N-gram currently pointed to by the iterator. Undefined if |
| 91 | // the iterator is at the end. |
| 92 | NGramType ngram_ = 0; |
| 93 | }; |
| 94 | |
| 95 | // Constructs an extractor for iterating over N-grams contained in the |
| 96 | // |string|. |is_separator| is used to determine whether a certain character |
| 97 | // is a separator and should not be contained in an N-gram. |
| 98 | NGramExtractor(base::StringPiece string, IsSeparator is_separator) |
| 99 | : string_(string), is_separator_(is_separator) {} |
| 100 | |
| 101 | Iterator begin() const { return Iterator(*this, string_.begin()); } |
| 102 | Iterator end() const { return Iterator(*this, string_.end()); } |
| 103 | |
| 104 | private: |
| 105 | static_assert(std::is_integral<NGramType>::value, "Not an integral type."); |
| 106 | static_assert(std::is_unsigned<NGramType>::value, "Not an unsigned type."); |
| 107 | static_assert(N > 0u, "N should be positive."); |
| 108 | static_assert(N <= sizeof(NGramType), "N-gram doesn't fit into the type."); |
| 109 | |
| 110 | base::StringPiece string_; |
| 111 | IsSeparator is_separator_; |
| 112 | }; |
| 113 | |
| 114 | // A helper function used to create an NGramExtractor for a |string| without |
| 115 | // knowing the direct type of the |is_separator| functor. |
| 116 | // |
| 117 | // Typical usage: |
| 118 | // const char* str = "no*abacaba*abcd"; |
| 119 | // auto extractor = CreateNGramExtractor<5, uint64_t>( |
| 120 | // str, [](char c) { return c == '*'; }); |
| 121 | // for (uint64_t ngram : extractor) { |
| 122 | // ... process the |ngram| ... |
| 123 | // } |
| 124 | template <size_t N, typename NGramType, typename IsSeparator> |
| 125 | NGramExtractor<N, NGramType, IsSeparator> CreateNGramExtractor( |
| 126 | base::StringPiece string, |
| 127 | IsSeparator is_separator) { |
| 128 | return NGramExtractor<N, NGramType, IsSeparator>(string, is_separator); |
| 129 | } |
| 130 | |
Pavel Kalinnikov | d797063 | 2017-06-20 09:07:34 | [diff] [blame] | 131 | } // namespace url_pattern_index |
pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 132 | |
Pavel Kalinnikov | d797063 | 2017-06-20 09:07:34 | [diff] [blame] | 133 | #endif // COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_ |