init: Add C++ tokenizer. Adds a C++ tokenizer along with unit tests. This tokenizer will replace the current C implementation, which does a poor job of keeping track of pointers. This CL is a prerequisite for upcoming changes to the parser. This CL does not wire up this tokenizer and changes no existing code. All that builds is the unit tests. Change-Id: Iec3740bce7153640adc5e5bbdc57e644cedf0038 TEST: Unit tests all pass. No leaks under valgrind. BUG: 22843198

commit: 220ca84223dca5aa7a58c1a941d745c1387d29be [log] [tgz]
author: Lee Campbell <leecam@google.com> Thu Jul 30 09:27:11 2015 -0700
committer: Lee Campbell <leecam@google.com> Thu Jul 30 18:45:17 2015 +0000
tree: f533e0cccaec4631dd1d37f5f6332122bb7ce418
parent: fac3bf35af412292e1a955510095c4234dc0fd53 [diff] [blame]
diff --git a/init/parser/tokenizer.h b/init/parser/tokenizer.h
new file mode 100644
index 0000000..40a22b1
--- /dev/null
+++ b/init/parser/tokenizer.h

@@ -0,0 +1,69 @@
+// Copyright (C) 2015 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+namespace init {
+
+// Used to tokenize a std::string.
+// Call Next() to advance through each token until it returns false,
+// indicating there are no more tokens left in the string.
+// The current token can be accessed with current(), which returns
+// a reference to a Token.
+// Supported tokens are:
+// TOK_START - Next() has yet to be called
+// TOK_END - At the end of string
+// TOK_NEWLINE - The end of a line denoted by \n.
+// TOK_TEXT - A word.
+// Comments are denoted with '#' and the tokenizer will ignore
+// the rest of the line.
+// Double quotes can be used to insert whitespace into words.
+// A backslash at the end of a line denotes continuation and
+// a TOK_NEWLINE will not be generated for that line.
+class Tokenizer {
+ public:
+  Tokenizer(const std::string& data);  // NOTE(review): not explicit; |data| presumably must outlive this object (data_ is a reference) - confirm.
+  ~Tokenizer();
+
+  enum TokenType { TOK_START, TOK_END, TOK_NEWLINE, TOK_TEXT };  // Meanings are listed in the class comment.
+  struct Token {
+    TokenType type;  // Which kind of token this is.
+    std::string text;  // Presumably the word for TOK_TEXT tokens - confirm in tokenizer.cc.
+  };
+
+  // Returns the current token.
+  const Token& current();  // NOTE(review): non-const, so it cannot be called through a const Tokenizer.
+
+  // Move to the next token, returns false at the end of input.
+  bool Next();
+
+ private:
+  void GetData();  // Private helpers below are defined outside this header.
+  void AdvChar();
+  void AdvText();
+  void AdvUntil(char x);
+  void AdvWhiteSpace();
+  void StartText();
+  void EndText();
+
+  const std::string& data_;  // Held by reference, not copied.
+  Token current_;  // Presumably the token returned by current() - confirm in tokenizer.cc.
+
+  bool eof_;  // Presumably set once the end of data_ is reached - confirm.
+  size_t pos_;  // Presumably the read position within data_ - confirm.
+  char cur_char_;  // Presumably the character at pos_ - confirm.
+  size_t tok_start_;  // Presumably where the token being built starts - confirm.
+};
+
+}  // namespace init
commit	220ca84223dca5aa7a58c1a941d745c1387d29be	[log] [tgz]
author	Lee Campbell <leecam@google.com>	Thu Jul 30 09:27:11 2015 -0700
committer	Lee Campbell <leecam@google.com>	Thu Jul 30 18:45:17 2015 +0000
tree	f533e0cccaec4631dd1d37f5f6332122bb7ce418
parent	fac3bf35af412292e1a955510095c4234dc0fd53 [diff] [blame]