// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
Pavel Kalinnikovd7970632017-06-20 09:07:345#ifndef COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_
6#define COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_
pkalinnikove77a62e2016-06-24 10:21:407
8#include <stddef.h>
9
10#include <iterator>
11#include <type_traits>
12
13#include "base/logging.h"
14#include "base/strings/string_piece.h"
15
Pavel Kalinnikovd7970632017-06-20 09:07:3416namespace url_pattern_index {
pkalinnikove77a62e2016-06-24 10:21:4017
18// The class used to iteratively extract N-grams from strings. An N-gram is a
19// string consisting of N (up to 8) non-special characters, which are stored in
20// the lowest N non-zero bytes, lower bytes corresponding to later symbols. The
21// size of the integer type limits the maximum value of N. For example an
22// uint64_t can store up to 8-grams.
23//
24// Note: If used for UTF-8 strings, the N-grams can have partial byte sequences.
25//
26// Template parameters:
27// * N - the size of N-grams.
28// * NGramType - the integer type used to encode N-grams.
29// * IsSeparator - the type of a bool(char) functor.
30template <size_t N, typename NGramType, typename IsSeparator>
31class NGramExtractor {
32 public:
33 // An STL compatible input iterator over N-grams contained in a string.
34 class Iterator : public std::iterator<std::input_iterator_tag, NGramType> {
35 public:
36 // Creates an iterator, which points to the leftmost valid N-gram within the
37 // |extractor|'s string, starting from |head|.
38 Iterator(const NGramExtractor& extractor,
39 base::StringPiece::const_iterator head)
40 : extractor_(extractor), head_(head), end_(extractor.string_.end()) {
41 DCHECK_GE(head, extractor_.string_.begin());
42 DCHECK_LE(head, end_);
43
44 CompleteNGramFrom(0);
45 }
46
47 bool operator==(const Iterator& rhs) const { return head_ == rhs.head_; }
48 bool operator!=(const Iterator& rhs) const { return !operator==(rhs); }
49
50 NGramType operator*() const { return ngram_; }
51 NGramType* operator->() const { return &ngram_; }
52
53 Iterator& operator++() {
54 ngram_ &= ~(static_cast<NGramType>(0xFFu) << 8 * (N - 1));
55 ++head_;
56 CompleteNGramFrom(N - 1);
57 return *this;
58 }
59
60 Iterator operator++(int) {
61 Iterator copy(*this);
62 operator++();
63 return copy;
64 }
65
66 private:
67 // Consumes characters starting with the one pointed to by |head_|, as many
68 // of them as needed to extend |ngram_| from its |current_length| to a
69 // length of N. Leaves |head_| pointing to the last character consumed.
70 void CompleteNGramFrom(size_t current_length) {
71 for (; head_ != end_; ++head_) {
72 if (extractor_.is_separator_(*head_)) {
73 current_length = 0;
74 ngram_ = 0;
75 } else {
76 ngram_ = ngram_ << 8 | static_cast<NGramType>(*head_);
77 if (++current_length == N)
78 break;
79 }
80 }
81 }
82
83 const NGramExtractor& extractor_;
84
85 // Always points to the last character included in the current |ngram_|.
86 base::StringPiece::const_iterator head_;
87 // Always points to extractor_.string_.end().
88 base::StringPiece::const_iterator end_;
89
90 // Contains the N-gram currently pointed to by the iterator. Undefined if
91 // the iterator is at the end.
92 NGramType ngram_ = 0;
93 };
94
95 // Constructs an extractor for iterating over N-grams contained in the
96 // |string|. |is_separator| is used to determine whether a certain character
97 // is a separator and should not be contained in an N-gram.
98 NGramExtractor(base::StringPiece string, IsSeparator is_separator)
99 : string_(string), is_separator_(is_separator) {}
100
101 Iterator begin() const { return Iterator(*this, string_.begin()); }
102 Iterator end() const { return Iterator(*this, string_.end()); }
103
104 private:
105 static_assert(std::is_integral<NGramType>::value, "Not an integral type.");
106 static_assert(std::is_unsigned<NGramType>::value, "Not an unsigned type.");
107 static_assert(N > 0u, "N should be positive.");
108 static_assert(N <= sizeof(NGramType), "N-gram doesn't fit into the type.");
109
110 base::StringPiece string_;
111 IsSeparator is_separator_;
112};
113
114// A helper function used to create an NGramExtractor for a |string| without
115// knowing the direct type of the |is_separator| functor.
116//
117// Typical usage:
118// const char* str = "no*abacaba*abcd";
119// auto extractor = CreateNGramExtractor<5, uint64_t>(
120// str, [](char c) { return c == '*'; });
121// for (uint64_t ngram : extractor) {
122// ... process the |ngram| ...
123// }
124template <size_t N, typename NGramType, typename IsSeparator>
125NGramExtractor<N, NGramType, IsSeparator> CreateNGramExtractor(
126 base::StringPiece string,
127 IsSeparator is_separator) {
128 return NGramExtractor<N, NGramType, IsSeparator>(string, is_separator);
129}
130
Pavel Kalinnikovd7970632017-06-20 09:07:34131} // namespace url_pattern_index
pkalinnikove77a62e2016-06-24 10:21:40132
Pavel Kalinnikovd7970632017-06-20 09:07:34133#endif // COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_