You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

286 lines
8.5 KiB

  1. <?php
  2. /**
  3. * Hoa
  4. *
  5. *
  6. * @license
  7. *
  8. * New BSD License
  9. *
  10. * Copyright © 2007-2016, Hoa community. All rights reserved.
  11. *
  12. * Redistribution and use in source and binary forms, with or without
  13. * modification, are permitted provided that the following conditions are met:
  14. * * Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. * * Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. * * Neither the name of the Hoa nor the names of its contributors may be
  20. * used to endorse or promote products derived from this software without
  21. * specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  24. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
  27. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  28. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  29. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  30. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  31. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  32. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  33. * POSSIBILITY OF SUCH DAMAGE.
  34. */
  35. namespace Hoa\Compiler\Llk;
  36. use Hoa\Compiler;
  /**
   * Class \Hoa\Compiler\Llk\Lexer.
   *
   * PP lexer: turns a text into a flat list of tokens, using sets of regular
   * expressions grouped by namespace. A lexeme declared as `name:namespace`
   * switches the lexer to `namespace` once it has been matched; the special
   * namespace `__shift__` (optionally `__shift__ * N`) goes back to a
   * previously visited namespace.
   *
   * @copyright  Copyright © 2007-2016 Hoa community
   * @license    New BSD License
   */
  class Lexer
  {
      /**
       * Lexer state, i.e. the name of the namespace currently in use.
       *
       * @var string
       */
      protected $_lexerState = null;

      /**
       * Text being tokenized.
       *
       * @var string
       */
      protected $_text       = null;

      /**
       * Tokens, indexed by namespace then by lexeme name. Each entry is a
       * pair `[regex, next namespace or null]`.
       *
       * @var array
       */
      protected $_tokens     = [];

      /**
       * Stack of visited namespaces. Only allocated when at least one lexeme
       * targets a `__shift__` namespace; `null` otherwise.
       *
       * @var \SplStack
       */
      protected $_nsStack    = null;

      /**
       * Text tokenizer: splits the text in parameter in an ordered array of
       * tokens.
       *
       * Every returned token is an array with the keys `token`, `value`,
       * `length`, `namespace`, `keep` and `offset` (byte offset in the text).
       * Lexemes named `skip` are consumed but not returned. The list always
       * ends with an `EOF` token.
       *
       * @param   string  $text      Text to tokenize.
       * @param   array   $tokens    Tokens to be returned, indexed by
       *                             namespace then by `lexeme[:namespace]`.
       * @return  array
       * @throws  \Hoa\Compiler\Exception\UnrecognizedToken
       * @throws  \Hoa\Compiler\Exception\Lexer
       */
      public function lexMe($text, array $tokens)
      {
          $this->_text       = $text;
          $this->_tokens     = [];
          $this->_nsStack    = null;
          $this->_lexerState = 'default';
          $offset            = 0;
          $maxOffset         = strlen($this->_text);
          $tokenized         = [];
          $stack             = false;

          // Normalise the declarations: `lexeme:namespace => regex` becomes
          // `lexeme => [regex, namespace]`. Iterating by value avoids leaving
          // a dangling reference behind.
          foreach ($tokens as $namespaceName => $lexemes) {
              $normalized = [];

              foreach ($lexemes as $fullLexeme => $regex) {
                  if (false === strpos($fullLexeme, ':')) {
                      $normalized[$fullLexeme] = [$regex, null];

                      continue;
                  }

                  list($lexeme, $namespace) = explode(':', $fullLexeme, 2);

                  $stack               = $stack || '__shift__' === substr($namespace, 0, 9);
                  $normalized[$lexeme] = [$regex, $namespace];
              }

              $this->_tokens[$namespaceName] = $normalized;
          }

          // The namespace stack is only needed if some lexeme shifts.
          if ($stack) {
              $this->_nsStack = new \SplStack();
          }

          while ($offset < $maxOffset) {
              $nextToken = $this->nextToken($offset);

              if (null === $nextToken) {
                  throw new Compiler\Exception\UnrecognizedToken(
                      'Unrecognized token "%s" at line 1 and column %d:' .
                      "\n" . '%s' . "\n" .
                      str_repeat(' ', mb_strlen(substr($text, 0, $offset))) . '↑',
                      0,
                      [
                          mb_substr(substr($text, $offset), 0, 1),
                          $offset + 1,
                          $text
                      ],
                      1,
                      $offset
                  );
              }

              if (true === $nextToken['keep']) {
                  $nextToken['offset'] = $offset;
                  $tokenized[]         = $nextToken;
              }

              // Offsets are expressed in bytes, hence `strlen`.
              $offset += strlen($nextToken['value']);
          }

          $tokenized[] = [
              'token'     => 'EOF',
              'value'     => 'EOF',
              'length'    => 0,
              'namespace' => 'default',
              'keep'      => true,
              'offset'    => $offset
          ];

          return $tokenized;
      }

      /**
       * Compute the next token recognized at the given offset of the text.
       *
       * Lexemes of the current namespace are tried in declaration order; the
       * first one that matches wins. If that lexeme targets another
       * namespace, the lexer state is updated before returning.
       *
       * @param   int  $offset    Offset (in bytes).
       * @return  array           The token, or `null` if no lexeme matches.
       * @throws  \Hoa\Compiler\Exception\Lexer
       */
      protected function nextToken($offset)
      {
          // Unknown namespace: nothing can match. No reference is taken, so
          // no empty entry is created as a side effect.
          if (!isset($this->_tokens[$this->_lexerState])) {
              return null;
          }

          foreach ($this->_tokens[$this->_lexerState] as $lexeme => $bucket) {
              list($regex, $nextState) = $bucket;

              if (null === $nextState) {
                  $nextState = $this->_lexerState;
              }

              $out = $this->matchLexeme($lexeme, $regex, $offset);

              if (null === $out) {
                  continue;
              }

              // The token belongs to the namespace it was matched in, not to
              // the one it leads to.
              $out['namespace'] = $this->_lexerState;
              $out['keep']      = 'skip' !== $lexeme;

              if ($nextState === $this->_lexerState) {
                  return $out;
              }

              $shift = false;

              if (null !== $this->_nsStack &&
                  1 === preg_match('#^__shift__(?:\s*\*\s*(\d+))?$#', $nextState, $matches)) {
                  $i = isset($matches[1]) ? intval($matches[1]) : 1;

                  if ($i > ($c = count($this->_nsStack))) {
                      throw new Compiler\Exception\Lexer(
                          'Cannot shift namespace %d-times, from token ' .
                          '%s in namespace %s, because the stack ' .
                          'contains only %d namespaces.',
                          1,
                          [
                              $i,
                              $lexeme,
                              $this->_lexerState,
                              $c
                          ]
                      );
                  }

                  // Stays `null` when zero shifts are requested, which is
                  // then reported below as a missing namespace.
                  $previousNamespace = null;

                  while (1 <= $i--) {
                      $previousNamespace = $this->_nsStack->pop();
                  }

                  $nextState = $previousNamespace;
                  $shift     = true;
              }

              if (!isset($this->_tokens[$nextState])) {
                  throw new Compiler\Exception\Lexer(
                      'Namespace %s does not exist, called by token %s ' .
                      'in namespace %s.',
                      2,
                      [
                          $nextState,
                          $lexeme,
                          $this->_lexerState
                      ]
                  );
              }

              // Remember where we come from, unless we are going back.
              if (null !== $this->_nsStack && false === $shift) {
                  $this->_nsStack->push($this->_lexerState);
              }

              $this->_lexerState = $nextState;

              return $out;
          }

          return null;
      }

      /**
       * Check if a given lexeme is matched at the given offset of the text.
       *
       * @param   string  $lexeme    Name of the lexeme.
       * @param   string  $regex     Regular expression describing the lexeme.
       * @param   int     $offset    Offset (in bytes).
       * @return  array              The keys `token`, `value` and `length`,
       *                             or `null` if the lexeme does not match.
       * @throws  \Hoa\Compiler\Exception\Lexer
       */
      protected function matchLexeme($lexeme, $regex, $offset)
      {
          // `#` is the delimiter used below. Escape only the `#` that are not
          // already escaped (i.e. preceded by an even number of backslashes),
          // otherwise `\#` would become `\\#` and end the pattern too early.
          $_regex = preg_replace('/(?<!\\\\)((?:\\\\\\\\)*)#/', '$1\\\\#', $regex);
          $preg   = preg_match(
              '#\G(?|' . $_regex . ')#u',
              $this->_text,
              $matches,
              0,
              $offset
          );

          // `false` means PCRE failed (invalid pattern, invalid UTF-8
          // subject…). Treating it as a match would yield a token with no
          // value and prevent the caller from ever advancing.
          if (false === $preg) {
              throw new Compiler\Exception\Lexer(
                  'PCRE failed while matching the lexeme "%s" (%s); ' .
                  'preg_last_error() returned %d.',
                  4,
                  [$lexeme, $regex, preg_last_error()]
              );
          }

          if (0 === $preg) {
              return null;
          }

          if ('' === $matches[0]) {
              throw new Compiler\Exception\Lexer(
                  'A lexeme must not match an empty value, which is the ' .
                  'case of "%s" (%s).',
                  3,
                  [$lexeme, $regex]
              );
          }

          return [
              'token'  => $lexeme,
              'value'  => $matches[0],
              'length' => mb_strlen($matches[0])
          ];
      }
  }