CssParser.php 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108
  1. <?php
  2. /** @file
  3. * CSS selector parsing classes.
  4. *
  5. * This file contains the tools necessary for parsing CSS 3 selectors.
  6. * In the future it may be expanded to handle all of CSS 3.
  7. *
  8. * The parser contained herein has an event-based API. Implementors should
  9. * begin by implementing the {@link CssEventHandler} interface. For an example
  10. * of how this is done, see {@link CssEventHandler.php}.
  11. *
  12. * @author M Butcher <matt@aleph-null.tv>
  13. * @license http://opensource.org/licenses/lgpl-2.1.php The GNU Lesser GPL (LGPL) or an MIT-like license.
  14. */
  15. /** @addtogroup querypath_css CSS Parsing
  16. * QueryPath includes a CSS 3 Selector parser.
  17. *
  18. *
  19. * Typically the parser is not accessed directly. Most developers will use it indirectly from
  20. * qp(), htmlqp(), or one of the methods on a QueryPath object.
  21. *
  22. * This parser is modular and is not tied to QueryPath, so you can use it in your
  23. * own (non-QueryPath) projects if you wish. To dive in, start with CssEventHandler, the
  24. * event interface that works like a SAX API for CSS selectors. If you want to check out
  25. * the details, check out the parser (CssParser), scanner (CssScanner), and token list (CssToken).
  26. */
  27. require_once 'CssEventHandler.php';
  /**
   * Contract for receiving CSS 3 selector parsing events.
   *
   * While the parser walks a selector it reports every component it
   * recognizes by calling one of the methods declared here. The design is
   * modelled on the SAX2 API for XML: one event per selector component, with
   * the relevant data passed as arguments.
   *
   * @ingroup querypath_css
   */
  interface CssEventHandler {
    /** Attribute operator: is-exactly (=). */
    const isExactly = 0;
    /** Attribute operator: contains-with-space (~=). */
    const containsWithSpace = 1;
    /** Attribute operator: contains-with-hyphen (|=). */
    const containsWithHyphen = 2;
    /** Attribute operator: contains-in-string (*=). */
    const containsInString = 3;
    /** Attribute operator: begins-with (^=). */
    const beginsWith = 4;
    /** Attribute operator: ends-with ($=). */
    const endsWith = 5;
    /** The any-element operator (*). */
    const anyElement = '*';

    /**
     * An ID selector was found.
     * Example: #name
     *
     * @param string $id
     *   The ID, without the leading octothorp.
     */
    public function elementID($id);

    /**
     * An element name was found.
     * Example: name
     *
     * @param string $name
     *   The element name.
     */
    public function element($name);

    /**
     * A namespaced element name was found.
     * Example: namespace|name
     *
     * @param string $name
     *   The tag name.
     * @param string $namespace
     *   The namespace identifier (not the URI).
     */
    public function elementNS($name, $namespace = NULL);

    /**
     * The any-element operator was found.
     * Example: *
     */
    public function anyElement();

    /**
     * An any-element operator restricted to a namespace was found.
     * Example: ns|*
     *
     * @param string $ns
     *   The namespace identifier (not the URI).
     */
    public function anyElementInNS($ns);

    /**
     * A class selector was found.
     * Example: .name
     *
     * @param string $name
     *   The class name.
     */
    public function elementClass($name);

    /**
     * An attribute selector was found.
     * Examples: [name=attr], [name~=attr]
     *
     * @param string $name
     *   The attribute name.
     * @param string $value
     *   The attribute value, if one was given.
     * @param int $operation
     *   One of the operator constants declared on this interface.
     */
    public function attribute($name, $value = NULL, $operation = self::isExactly);

    /**
     * An attribute selector bound to a namespace was found.
     * Examples: [ns|name=attr], [ns|name~=attr]
     *
     * @param string $name
     *   The attribute name.
     * @param string $ns
     *   The namespace identifier (not the URI).
     * @param string $value
     *   The attribute value, if one was given.
     * @param int $operation
     *   One of the operator constants declared on this interface.
     */
    public function attributeNS($name, $ns, $value = NULL, $operation = self::isExactly);

    /**
     * A pseudo-class was found.
     * Example: :name(value)
     *
     * @param string $name
     *   The pseudo-class name.
     * @param string $value
     *   The value in parentheses, if one was found.
     */
    public function pseudoClass($name, $value = NULL);

    /**
     * A pseudo-element was found.
     * Example: ::name
     *
     * @param string $name
     *   The pseudo-element name.
     */
    public function pseudoElement($name);

    /**
     * The direct descendant combinator was found.
     * Example: >
     */
    public function directDescendant();

    /**
     * The adjacent combinator was found.
     * Example: +
     */
    public function adjacent();

    /**
     * The another-selector combinator was found.
     * Example: ,
     */
    public function anotherSelector();

    /**
     * The sibling combinator was found.
     * Example: ~
     */
    public function sibling();

    /**
     * The any-descendant combinator was found.
     * Example: ' ' (a space)
     */
    public function anyDescendant();
  }
  /**
   * Token identifiers for CSS selector parsing.
   *
   * Declares one integer constant per token the parser recognizes, plus a
   * helper that turns a token identifier into readable text for error
   * messages.
   *
   * @ingroup querypath_css
   */
  final class CssToken {
    const char = 0;
    const star = 1;
    const rangle = 2;
    const dot = 3;
    const octo = 4;
    const rsquare = 5;
    const lsquare = 6;
    const colon = 7;
    const rparen = 8;
    const lparen = 9;
    const plus = 10;
    const tilde = 11;
    const eq = 12;
    const pipe = 13;
    const comma = 14;
    const white = 15;
    const quote = 16;
    const squote = 17;
    const bslash = 18;
    const carat = 19;
    const dollar = 20;
    // Not part of the spec; apparently used by old, broken CSS.
    const at = 21;
    // A character that is in the legal range for strings.
    const stringLegal = 99;

    /**
     * Describe a token identifier in words, for use in error messages.
     *
     * @param mixed $const_int
     *   One of the constants on this class, or FALSE for end of input.
     * @return string
     *   The readable name. Anything unrecognized is reported as an
     *   "illegal character" together with the value that was passed in.
     */
    public static function name($const_int) {
      // The position of each label equals the value of the matching constant.
      $labels = array(
        'character', 'star', 'right angle bracket',
        'dot', 'octothorp', 'right square bracket', 'left square bracket',
        'colon', 'right parenthesis', 'left parenthesis', 'plus', 'tilde',
        'equals', 'vertical bar', 'comma', 'space', 'quote', 'single quote',
        'backslash', 'carat', 'dollar', 'at',
      );

      // FALSE would index slot 0 as an array key, so the numeric check is what
      // keeps end-of-input from being reported as 'character'.
      if (isset($labels[$const_int]) && is_numeric($const_int)) {
        return $labels[$const_int];
      }
      if ($const_int == self::stringLegal) {
        return 'a legal non-alphanumeric character';
      }
      if ($const_int == FALSE) {
        return 'end of file';
      }
      return sprintf('illegal character (%s)', $const_int);
    }
  }
  221. /**
  222. * Parse a CSS selector.
  223. *
  224. * In CSS, a selector is used to identify which element or elements
  225. * in a DOM are being selected for the application of a particular style.
  226. * Effectively, selectors function as a query language for a structured
  227. * document -- almost always HTML or XML.
  228. *
  229. * This class provides an event-based parser for CSS selectors. It can be
  230. * used, for example, as a basis for writing a DOM query engine based on
  231. * CSS.
  232. *
  233. * @ingroup querypath_css
  234. */
  235. class CssParser {
  /** The CssScanner that tokenizes the selector string. */
  protected $scanner = NULL;
  /** NOTE(review): not read or written by any CssParser method; purpose unconfirmed. */
  protected $buffer = '';
  /** The CssEventHandler that receives parse events. */
  protected $handler = NULL;
  /** When TRUE, a leading '@' inside an attribute selector is rejected instead of skipped. */
  protected $strict = FALSE;
  /** When TRUE, the parser prints trace messages as it runs. */
  protected $DEBUG = FALSE;
  /**
   * The selector string exactly as given to the constructor.
   *
   * Declared explicitly because the constructor assigns it; without the
   * declaration the assignment creates a dynamic property, which newer PHP
   * versions deprecate. It is public so that code which read the formerly
   * dynamic (and therefore public) property keeps working.
   */
  public $originalString = NULL;

  /**
   * Construct a new CSS parser for a selector string.
   *
   * This only wires up the input stream, the scanner and the event handler.
   * Nothing is parsed and no events are fired until parse() is called.
   *
   * @param string $string
   *   The CSS selector to parse.
   * @param CssEventHandler $handler
   *   The handler that will receive parse events.
   */
  public function __construct($string, CssEventHandler $handler) {
    $this->originalString = $string;
    $is = new CssInputStream($string);
    $this->scanner = new CssScanner($is);
    $this->handler = $handler;
  }
  /**
   * Run the parser over the whole selector.
   *
   * Loads the first token, then calls selector() repeatedly until the scanner
   * reports the end of input (its token becomes FALSE). Events are delivered
   * to the CssEventHandler as each component is recognized.
   *
   * @throws CssParseException
   *   If a full pass finishes at the position where it started while input
   *   remains, meaning the remaining text matches nothing the parser
   *   recognizes.
   */
  public function parse() {
    $scanner = $this->scanner;
    $scanner->nextToken();

    while ($scanner->token !== FALSE) {
      // Remember where this pass starts so a pass that consumes nothing can be
      // detected and the loop cannot spin forever.
      $startedAt = $scanner->position();
      if ($this->DEBUG) {
        print "PARSE " . $scanner->token. "\n";
      }

      $this->selector();

      $madeNoProgress = ($scanner->position() == $startedAt);
      if ($madeNoProgress && $scanner->token !== FALSE) {
        throw new CssParseException('CSS selector is not well formed.');
      }
    }
  }
  279. /**
  280. * A restricted parser that can only parse simple selectors.
  281. * The pseudoClass handler for this parser will throw an
  282. * exception if it encounters a pseudo-element or the
  283. * negation pseudo-class.
  284. *
  285. * @deprecated This is not used anywhere in QueryPath and
  286. * may be removed.
  287. *//*
  288. public function parseSimpleSelector() {
  289. while ($this->scanner->token !== FALSE) {
  290. if ($this->DEBUG) print "SIMPLE SELECTOR\n";
  291. $this->allElements();
  292. $this->elementName();
  293. $this->elementClass();
  294. $this->elementID();
  295. $this->pseudoClass(TRUE); // Operate in restricted mode.
  296. $this->attribute();
  297. // TODO: Need to add failure conditions here.
  298. }
  299. }*/
  /**
   * Handle one step of a selector.
   *
   * A step is: optional leading whitespace, a run of simple selectors, and
   * then whatever combinator (if any) follows them.
   */
  private function selector() {
    if ($this->DEBUG) {
      print "SELECTOR{$this->scanner->position()}\n";
    }
    // Leading whitespace carries no meaning here, so drop it.
    $this->consumeWhitespace();
    $this->simpleSelectors();
    $this->combinator();
  }
  /**
   * Skip over consecutive whitespace tokens.
   *
   * @return int
   *   How many whitespace tokens were skipped (0 if the current token was not
   *   whitespace).
   */
  private function consumeWhitespace() {
    if ($this->DEBUG) {
      print "CONSUME WHITESPACE\n";
    }
    for ($skipped = 0; $this->scanner->token == CssToken::white; ++$skipped) {
      $this->scanner->nextToken();
    }
    return $skipped;
  }
  /**
   * Handle one of the five combinators: '>', '+', ' ', '~', and ','.
   *
   * Both ' ' and ' > ' are valid, so whitespace is skipped and counted first.
   * If an explicit combinator follows, the whitespace was only padding. If
   * none follows but whitespace was skipped, the whitespace itself is the
   * any-descendant combinator.
   *
   * @see CssEventHandler::directDescendant()
   * @see CssEventHandler::adjacent()
   * @see CssEventHandler::anotherSelector()
   * @see CssEventHandler::sibling()
   * @see CssEventHandler::anyDescendant()
   *
   * @throws CssParseException
   *   If an explicit combinator is directly followed by another one.
   */
  private function combinator() {
    if ($this->DEBUG) {
      print "COMBINATOR\n";
    }

    $skipped = $this->consumeWhitespace();
    $current = $this->scanner->token;

    // Fire the event for an explicit combinator, if the current token is one.
    $explicit = TRUE;
    switch ($current) {
      case CssToken::rangle:
        $this->handler->directDescendant();
        break;
      case CssToken::plus:
        $this->handler->adjacent();
        break;
      case CssToken::comma:
        $this->handler->anotherSelector();
        break;
      case CssToken::tilde:
        $this->handler->sibling();
        break;
      default:
        $explicit = FALSE;
    }

    if ($explicit) {
      $this->scanner->nextToken();
      if ($this->DEBUG) {
        print "COMBINATOR: " . CssToken::name($current) . "\n";
      }
      // Padding after the combinator is fine; a second combinator is not.
      $this->consumeWhitespace();
      if ($this->isCombinator($this->scanner->token)) {
        throw new CssParseException("Illegal combinator: Cannot have two combinators in sequence.");
      }
      return;
    }

    if ($skipped > 0) {
      // Whitespace with no explicit combinator after it.
      if ($this->DEBUG) {
        print "COMBINATOR: any descendant\n";
      }
      $this->handler->anyDescendant();
      return;
    }

    if ($this->DEBUG) {
      print "COMBINATOR: no combinator found.\n";
    }
  }
  /**
   * Tell whether a token is one of the explicit combinators.
   *
   * The explicit combinators are '+', '>', ',' and '~'. Whitespace is not
   * included.
   *
   * @param mixed $tok
   *   A token identifier as produced by the scanner.
   * @return bool
   *   TRUE if the token is an explicit combinator.
   */
  private function isCombinator($tok) {
    return $tok == CssToken::plus
      || $tok == CssToken::rangle
      || $tok == CssToken::comma
      || $tok == CssToken::tilde;
  }
  /**
   * Handle a run of simple selectors.
   *
   * Each sub-parser is tried once, in a fixed order. A sub-parser whose
   * leading token is not the current token does nothing.
   */
  private function simpleSelectors() {
    if ($this->DEBUG) {
      print "SIMPLE SELECTOR\n";
    }
    // The order is significant: it is the order in which components are tried.
    $steps = array(
      'allElements',
      'elementName',
      'elementClass',
      'elementID',
      'pseudoClass',
      'attribute',
    );
    foreach ($steps as $step) {
      $this->$step();
    }
  }
  /**
   * Handle a CSS ID selector (#name).
   *
   * Does nothing unless the current token is an octothorp. Otherwise the name
   * that follows is read and passed to CssEventHandler::elementID().
   *
   * @throws CssParseException
   *   If the token after '#' is not a character token.
   */
  private function elementID() {
    if ($this->DEBUG) {
      print "ELEMENT ID\n";
    }
    if ($this->scanner->token != CssToken::octo) {
      return;
    }

    $this->scanner->nextToken();
    if ($this->scanner->token !== CssToken::char) {
      throw new CssParseException("Expected string after #");
    }
    $this->handler->elementID($this->scanner->getNameString());
  }
  /**
   * Handle a CSS class selector (.name).
   *
   * Does nothing unless the current token is a dot. Otherwise the name that
   * follows is read and passed to CssEventHandler::elementClass().
   */
  private function elementClass() {
    if ($this->DEBUG) {
      print "ELEMENT CLASS\n";
    }
    if ($this->scanner->token != CssToken::dot) {
      return;
    }

    $this->scanner->nextToken();
    // Deliberately lenient: whitespace between the dot and the name is
    // skipped rather than reported as an error.
    $this->consumeWhitespace();
    $this->handler->elementClass($this->scanner->getNameString());
  }
  /**
   * Handle a pseudo-class (:name or :name(value)) or pseudo-element (::name).
   *
   * Does nothing unless the current token is a colon. A second colon marks a
   * CSS 3 pseudo-element, which is reported through
   * CssEventHandler::pseudoElement(). Everything else is reported through
   * CssEventHandler::pseudoClass().
   *
   * @param bool $restricted
   *   When TRUE, the 'not' pseudo-class and all pseudo-elements are rejected.
   *
   * @throws CssParseException
   *   If a restricted construct is used, if a pseudo-element is given
   *   arguments, or if a pseudo-element is followed by anything other than
   *   the end of input or a comma.
   */
  private function pseudoClass($restricted = FALSE) {
    if ($this->DEBUG) {
      print "PSEUDO-CLASS\n";
    }
    if ($this->scanner->token != CssToken::colon) {
      return;
    }

    // A second colon right after the first one means a pseudo-element.
    $isPseudoElement = ($this->scanner->nextToken() === CssToken::colon);
    if ($isPseudoElement) {
      $this->scanner->nextToken();
    }

    $name = $this->scanner->getNameString();
    if ($restricted && $name == 'not') {
      throw new CssParseException("The 'not' pseudo-class is illegal in this context.");
    }

    $value = NULL;
    if ($this->scanner->token == CssToken::lparen) {
      if ($isPseudoElement) {
        throw new CssParseException("Illegal left paren. Pseudo-Element cannot have arguments.");
      }
      $value = $this->pseudoClassValue();
    }

    if (!$isPseudoElement) {
      $this->handler->pseudoClass($name, $value);
      return;
    }

    if ($restricted) {
      throw new CssParseException("Pseudo-Elements are illegal in this context.");
    }
    $this->handler->pseudoElement($name);
    $this->consumeWhitespace();

    // A pseudo-element has to close its selector: only the end of input or
    // the start of another selector (a comma) may follow it.
    $next = $this->scanner->token;
    if ($next !== FALSE && $next !== CssToken::comma) {
      throw new CssParseException("A Pseudo-Element must be the last item in a selector.");
    }
  }
  /**
   * Read the parenthesized value of a pseudo-class.
   *
   * The value is left vague for now: reading it is delegated entirely to
   * CssScanner::getQuotedString(), and whatever that returns is handed back
   * as a string.
   *
   * @return string|null
   *   The value, or NULL when the current token is not a left parenthesis.
   *
   * @todo Pseudo-classes can be given pseudo-elements and other
   *   pseudo-classes as values, so :pseudo(::pseudo) is legal. That nested
   *   form is not parsed specially here.
   */
  private function pseudoClassValue() {
    if ($this->scanner->token != CssToken::lparen) {
      return NULL;
    }
    return '' . $this->scanner->getQuotedString();
  }
  /**
   * Handle element names, with or without a namespace prefix.
   *
   * Forms handled and the event each one fires:
   * - name     CssEventHandler::element()
   * - |name    CssEventHandler::element()
   * - ns|name  CssEventHandler::elementNS()
   * - ns|*     CssEventHandler::anyElementInNS()
   *
   * Does nothing when the current token is neither a pipe nor a character.
   *
   * @throws CssParseException
   *   If 'ns|' is followed by something other than a name or '*'.
   */
  private function elementName() {
    if ($this->DEBUG) {
      print "ELEMENT NAME\n";
    }
    $scanner = $this->scanner;

    if ($scanner->token === CssToken::pipe) {
      // '|name' is equivalent to plain 'name'.
      $scanner->nextToken();
      $this->consumeWhitespace();
      $this->handler->element($scanner->getNameString());
      return;
    }
    if ($scanner->token !== CssToken::char) {
      return;
    }

    // This is either the element name itself or a namespace prefix; the
    // token that follows it decides which.
    $first = $scanner->getNameString();
    if ($scanner->token != CssToken::pipe) {
      $this->handler->element($first);
      return;
    }

    // A pipe followed: $first is a namespace prefix.
    $scanner->nextToken();
    $this->consumeWhitespace();
    if ($scanner->token === CssToken::star) {
      // ns|*
      $this->handler->anyElementInNS($first);
      $scanner->nextToken();
    }
    elseif ($scanner->token !== CssToken::char) {
      $this->throwError(CssToken::char, $scanner->token);
    }
    else {
      // ns|name
      $this->handler->elementNS($scanner->getNameString(), $first);
    }
  }
  /**
   * Handle the all-elements designator and its namespaced variants.
   *
   * Forms handled and the event each one fires:
   * - *       CssEventHandler::anyElement()
   * - *|*     CssEventHandler::anyElementInNS('*')
   * - *|name  CssEventHandler::elementNS(name, '*')
   *
   * Does nothing unless the current token is a star.
   */
  private function allElements() {
    if ($this->scanner->token !== CssToken::star) {
      return;
    }

    $this->scanner->nextToken();
    if ($this->scanner->token !== CssToken::pipe) {
      // A bare '*'.
      $this->handler->anyElement();
      return;
    }

    $this->scanner->nextToken();
    if ($this->scanner->token === CssToken::star) {
      // '*|*': per the spec the element must have a namespace, so the
      // namespaced event is fired with '*' as the namespace.
      $this->scanner->nextToken();
      $this->handler->anyElementInNS('*');
      return;
    }

    // '*|name': the name must be in some namespace.
    $this->handler->elementNS($this->scanner->getNameString(), '*');
  }
  /**
   * Handle an attribute selector.
   *
   * An attribute selector has one of these forms, each optionally with a
   * namespace prefix on the name (ns|attrName or *|attrName):
   * - [attrName]
   * - [attrName=value], or with any of the ~= |= *= $= ^= operators
   *
   * Does nothing unless the current token is a left square bracket. Fires
   * CssEventHandler::attributeNS() when a namespace was given and
   * CssEventHandler::attribute() otherwise.
   *
   * @throws CssParseException
   *   If '@' prefixes the name while in strict mode, if an operator character
   *   is not followed by '=', or if the closing ']' is missing.
   */
  private function attribute() {
    if ($this->scanner->token != CssToken::lsquare) {
      return;
    }

    $attrVal = $op = $ns = NULL;
    $this->scanner->nextToken();
    $this->consumeWhitespace();

    // '@' in front of the name is not in the spec. It is tolerated (skipped)
    // unless strict mode is on.
    if ($this->scanner->token === CssToken::at) {
      if ($this->strict) {
        throw new CssParseException('The @ is illegal in attributes.');
      }
      $this->scanner->nextToken();
      $this->consumeWhitespace();
    }

    if ($this->scanner->token === CssToken::star) {
      // Global namespace: the attribute must be prefixed, so this is routed
      // to the namespace handler later on.
      $ns = '*';
      $this->scanner->nextToken();
    }
    if ($this->scanner->token === CssToken::pipe) {
      // A leading pipe ('|attr' or the pipe of '*|attr') adds nothing; skip it.
      $this->scanner->nextToken();
      $this->consumeWhitespace();
    }

    $attrName = $this->scanner->getNameString();
    $this->consumeWhitespace();

    // 'ns|attr' and the '|=' operator both start with a pipe, so peek at the
    // next character to tell them apart.
    if ($this->scanner->token === CssToken::pipe && $this->scanner->peek() !== '=') {
      $ns = $attrName;
      $this->scanner->nextToken();
      $attrName = $this->scanner->getNameString();
      $this->consumeWhitespace();
    }

    $op = $this->attributeOperator();

    if ($op !== NULL) {
      // The current token is the '=' of the operator; consume it and go on.
      $this->scanner->nextToken();
      $this->consumeWhitespace();

      // The grammar says values are quoted strings, but the spec's own
      // examples include barewords such as [lang=fr]. So quoted values
      // follow the string rules and bare values follow the NAME rules.
      $quoted = $this->scanner->token === CssToken::quote
        || $this->scanner->token === CssToken::squote;
      $attrVal = $quoted
        ? $this->scanner->getQuotedString()
        : $this->scanner->getNameString();

      if ($this->DEBUG) {
        print "ATTR: $attrVal AND OP: $op\n";
      }
    }

    $this->consumeWhitespace();
    if ($this->scanner->token != CssToken::rsquare) {
      $this->throwError(CssToken::rsquare, $this->scanner->token);
    }

    if (isset($ns)) {
      // As before, with a namespace and no operator both $attrVal and $op
      // are passed through as NULL.
      $this->handler->attributeNS($attrName, $ns, $attrVal, $op);
    }
    elseif (isset($attrVal)) {
      $this->handler->attribute($attrName, $attrVal, $op);
    }
    else {
      $this->handler->attribute($attrName);
    }
    $this->scanner->nextToken();
  }

  /**
   * Recognize the attribute operator at the current token, if there is one.
   *
   * Operators must be written without a space between their characters
   * (~= and not ~ =). For a two-character operator the scanner is advanced
   * onto its '='. In every successful case the scanner is left on the '='
   * token, which the caller consumes.
   *
   * @return int|null
   *   One of the CssEventHandler operator constants, or NULL when the current
   *   token does not start an operator.
   *
   * @throws CssParseException
   *   If the first character of a two-character operator is not followed
   *   by '='.
   */
  private function attributeOperator() {
    // First token of each two-character operator, mapped to the operation.
    $twoCharOperators = array(
      CssToken::tilde => CssEventHandler::containsWithSpace,
      CssToken::pipe => CssEventHandler::containsWithHyphen,
      CssToken::star => CssEventHandler::containsInString,
      CssToken::dollar => CssEventHandler::endsWith,
      CssToken::carat => CssEventHandler::beginsWith,
    );

    $lead = $this->scanner->token;
    if ($lead === CssToken::eq) {
      return CssEventHandler::isExactly;
    }
    // The token is FALSE at end of input, so make sure it is an int before
    // using it as an array key.
    if (!is_int($lead) || !isset($twoCharOperators[$lead])) {
      return NULL;
    }
    if ($this->scanner->nextToken() !== CssToken::eq) {
      $this->throwError(CssToken::eq, $this->scanner->token);
    }
    return $twoCharOperators[$lead];
  }
  /**
   * Throw a parse error with a consistently formatted message.
   *
   * @param mixed $expected
   *   The token identifier the parser needed at this point.
   * @param mixed $got
   *   The token identifier it actually found (FALSE at end of input).
   *
   * @throws CssParseException
   *   Always.
   */
  private function throwError($expected, $got) {
    $wanted = CssToken::name($expected);
    $found = CssToken::name($got);
    throw new CssParseException(sprintf('Expected %s, got %s', $wanted, $found));
  }
  734. }
  735. /**
  736. * Scanner for CSS selector parsing.
  737. *
  738. * This provides a simple scanner for traversing an input stream.
  739. *
  740. * @ingroup querypath_css
  741. */
  742. final class CssScanner {
  743. var $is = NULL;
  744. public $value = NULL;
  745. public $token = NULL;
  746. var $recurse = FALSE;
  747. var $it = 0;
  /**
   * Create a scanner that reads from the given input stream.
   *
   * Nothing is read here; tokens are produced one at a time by nextToken().
   *
   * @see CssInputStream
   * @param CssInputStream $in
   *   The input stream to be scanned.
   */
  public function __construct(CssInputStream $in) {
    $this->is = $in;
  }
  /**
   * Report how far the reader has advanced in the string.
   *
   * @return mixed
   *   The value of the input stream's position property.
   */
  public function position() {
    $stream = $this->is;
    return $stream->position;
  }
  /**
   * Look at the next character without consuming it.
   *
   * Delegates to the input stream's own peek().
   *
   * @return string
   *   The next character in the input stream.
   */
  public function peek() {
    $stream = $this->is;
    return $stream->peek();
  }
  772. /**
  773. * Get the next token in the input stream.
  774. *
  775. * This sets the current token to the value of the next token in
  776. * the stream.
  777. *
  778. * @return int
  779. * Returns an int value corresponding to one of the CssToken constants,
  780. * or FALSE if the end of the string is reached. (Remember to use
  781. * strong equality checking on FALSE, since 0 is a valid token id.)
  782. */
  783. public function nextToken() {
  784. $tok = -1;
  785. ++$this->it;
  786. if ($this->is->isEmpty()) {
  787. if ($this->recurse) {
  788. throw new Exception("Recursion error detected at iteration " . $this->it . '.');
  789. exit();
  790. }
  791. //print "{$this->it}: All done\n";
  792. $this->recurse = TRUE;
  793. $this->token = FALSE;
  794. return FALSE;
  795. }
  796. $ch = $this->is->consume();
  797. //print __FUNCTION__ . " Testing $ch.\n";
  798. if (ctype_space($ch)) {
  799. $this->value = ' '; // Collapse all WS to a space.
  800. $this->token = $tok = CssToken::white;
  801. //$ch = $this->is->consume();
  802. return $tok;
  803. }
  804. if (ctype_alnum($ch) || $ch == '-' || $ch == '_') {
  805. // It's a character
  806. $this->value = $ch; //strtolower($ch);
  807. $this->token = $tok = CssToken::char;
  808. return $tok;
  809. }
  810. $this->value = $ch;
  811. switch($ch) {
  812. case '*':
  813. $tok = CssToken::star;
  814. break;
  815. case chr(ord('>')):
  816. $tok = CssToken::rangle;
  817. break;
  818. case '.':
  819. $tok = CssToken::dot;
  820. break;
  821. case '#':
  822. $tok = CssToken::octo;
  823. break;
  824. case '[':
  825. $tok = CssToken::lsquare;
  826. break;
  827. case ']':
  828. $tok = CssToken::rsquare;
  829. break;
  830. case ':':
  831. $tok = CssToken::colon;
  832. break;
  833. case '(':
  834. $tok = CssToken::lparen;
  835. break;
  836. case ')':
  837. $tok = CssToken::rparen;
  838. break;
  839. case '+':
  840. $tok = CssToken::plus;
  841. break;
  842. case '~':
  843. $tok = CssToken::tilde;
  844. break;
  845. case '=':
  846. $tok = CssToken::eq;
  847. break;
  848. case '|':
  849. $tok = CssToken::pipe;
  850. break;
  851. case ',':
  852. $tok = CssToken::comma;
  853. break;
  854. case chr(34):
  855. $tok = CssToken::quote;
  856. break;
  857. case "'":
  858. $tok = CssToken::squote;
  859. break;
  860. case '\\':
  861. $tok = CssToken::bslash;
  862. break;
  863. case '^':
  864. $tok = CssToken::carat;
  865. break;
  866. case '$':
  867. $tok = CssToken::dollar;
  868. break;
  869. case '@':
  870. $tok = CssToken::at;
  871. break;
  872. }
  873. // Catch all characters that are legal within strings.
  874. if ($tok == -1) {
  875. // TODO: This should be UTF-8 compatible, but PHP doesn't
  876. // have a native UTF-8 string. Should we use external
  877. // mbstring library?
  878. $ord = ord($ch);
  879. // Characters in this pool are legal for use inside of
  880. // certain strings. Extended ASCII is used here, though I
  881. // Don't know if these are really legal.
  882. if (($ord >= 32 && $ord <= 126) || ($ord >= 128 && $ord <= 255)) {
  883. $tok = CssToken::stringLegal;
  884. }
  885. else {
  886. throw new CSSParseException('Illegal character found in stream: ' . $ord);
  887. }
  888. }
  889. $this->token = $tok;
  890. return $tok;
  891. }
  892. /**
  893. * Get a name string from the input stream.
  894. * A name string must be composed of
  895. * only characters defined in CssToken:char: -_a-zA-Z0-9
  896. */
  897. public function getNameString() {
  898. $buf = '';
  899. while ($this->token === CssToken::char) {
  900. $buf .= $this->value;
  901. $this->nextToken();
  902. //print '_';
  903. }
  904. return $buf;
  905. }
  906. /**
  907. * This gets a string with any legal 'string' characters.
  908. * See CSS Selectors specification, section 11, for the
  909. * definition of string.
  910. *
  911. * This will check for string1, string2, and the case where a
  912. * string is unquoted (Oddly absent from the "official" grammar,
  913. * though such strings are present as examples in the spec.)
  914. *
  915. * Note:
  916. * Though the grammar supplied by CSS 3 Selectors section 11 does not
  917. * address the contents of a pseudo-class value, the spec itself indicates
  918. * that a pseudo-class value is a "value between parenthesis" [6.6]. The
  919. * examples given use URLs among other things, making them closer to the
  920. * definition of 'string' than to 'name'. So we handle them here as strings.
  921. */
  922. public function getQuotedString() {
  923. if ($this->token == CssToken::quote || $this->token == CssToken::squote || $this->token == CssToken::lparen) {
  924. $end = ($this->token == CssToken::lparen) ? CssToken::rparen : $this->token;
  925. $buf = '';
  926. $escape = FALSE;
  927. $this->nextToken(); // Skip the opening quote/paren
  928. // The second conjunct is probably not necessary.
  929. while ($this->token !== FALSE && $this->token > -1) {
  930. //print "Char: $this->value \n";
  931. if ($this->token == CssToken::bslash && !$escape) {
  932. // XXX: The backslash (\) is removed here.
  933. // Turn on escaping.
  934. //$buf .= $this->value;
  935. $escape = TRUE;
  936. }
  937. elseif ($escape) {
  938. // Turn off escaping
  939. $buf .= $this->value;
  940. $escape = FALSE;
  941. }
  942. elseif ($this->token === $end) {
  943. // At end of string; skip token and break.
  944. $this->nextToken();
  945. break;
  946. }
  947. else {
  948. // Append char.
  949. $buf .= $this->value;
  950. }
  951. $this->nextToken();
  952. }
  953. return $buf;
  954. }
  955. }
  956. /**
  957. * Get a string from the input stream.
  958. * This is a convenience function for getting a string of
  959. * characters that are either alphanumber or whitespace. See
  960. * the CssToken::white and CssToken::char definitions.
  961. *
  962. * @deprecated This is not used anywhere in QueryPath.
  963. *//*
  964. public function getStringPlusWhitespace() {
  965. $buf = '';
  966. if($this->token === FALSE) {return '';}
  967. while ($this->token === CssToken::char || $this->token == CssToken::white) {
  968. $buf .= $this->value;
  969. $this->nextToken();
  970. }
  971. return $buf;
  972. }*/
  973. }
  /**
   * Simple wrapper to turn a string into an input stream.
   *
   * This provides a standard interface on top of an array of
   * characters. The string is split with str_split(), so each
   * "character" is a single byte.
   */
  class CssInputStream {
    /**
     * The characters that have not been consumed yet, first one at index 0.
     */
    protected $stream = NULL;

    /**
     * The number of characters consumed so far.
     */
    public $position = 0;

    /**
     * Build a new CSS input stream from a string.
     *
     * @param string $string
     *   String to turn into an input stream.
     */
    public function __construct($string) {
      // Before PHP 8.2, str_split('') returns array('') rather than an empty
      // array, which would make an empty input look like it holds one
      // (empty) character. Normalize so empty input is an empty stream.
      if ($string === '' || $string === NULL) {
        $this->stream = array();
      }
      else {
        $this->stream = str_split($string);
      }
    }

    /**
     * Look ahead one character.
     *
     * @return string|null
     *   Returns the next character, but does not remove it from
     *   the stream. Returns NULL if the stream is empty.
     */
    public function peek() {
      // Guard the lookup so an exhausted stream does not raise an
      // undefined-offset notice.
      return isset($this->stream[0]) ? $this->stream[0] : NULL;
    }

    /**
     * Get the next unconsumed character in the stream.
     *
     * This will remove that character from the front of the
     * stream and return it.
     *
     * @return string|null
     *   The consumed character, or NULL if the stream is empty.
     */
    public function consume() {
      $ret = array_shift($this->stream);
      // empty() must not be used here: the character '0' is "empty" in PHP
      // and would be consumed without advancing the position.
      if ($ret !== NULL && $ret !== '') {
        $this->position++;
      }
      return $ret;
    }

    /**
     * Check if the stream is empty.
     *
     * @return boolean
     *   Returns TRUE when the stream is empty, FALSE otherwise.
     */
    public function isEmpty() {
      return empty($this->stream);
    }
  }
  /**
   * Thrown when a CSS selector cannot be parsed.
   *
   * This is a plain marker type: it adds nothing to the base exception
   * and exists so that callers can catch parse failures specifically.
   *
   * @ingroup querypath_css
   */
  class CSSParseException extends Exception {
  }