Changeset 19 for trunk


Ignore:
Timestamp:
02/27/2008 05:56:24 AM (18 years ago)
Author:
dsowen
Message:
  • Added LEX-ALL helper function.
  • Added longest-match logic.
  • Updated documentation.
Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/example.lisp

    r18 r19  
    2929(defun un-dquote (s) (regex-replace-all "\"\"" (snip s) "\""))
    3030
    31 (deflexer scan-csv
     31(deflexer scan-csv (:priority-only t)
    3232  ("," comma)
    3333  ("[^\"',]+" value)
     
    3535  ("\"(?:[^\"]|\"\")*\"" value un-dquote))
    3636
    37 (defun scan-all (input)
    38   (labels ((scan (start tokens)
    39              (if (> (length input) start)
    40                  (multiple-value-bind (class image remainder)
    41                      (scan-csv input start)
    42                    (when class
    43                      (scan remainder (cons (cons class image) tokens))))
    44                  (nreverse tokens))))
    45     (scan 0 '())))
    4637
    47 (scan-all "no quotes,'a ''quote''',\"another \"\"quote\"\"\"")
     38
     39(lex-all 'scan-csv "no quotes,'a ''quote''',\"another \"\"quote\"\"\"")
  • trunk/lex.lisp

    r18 r19  
    1818(defpackage #:dso-lex
    1919    (:documentation "Allows the definition of lexers.  See DEFLEXER.")
    20   (:use #:cl #:cl-ppcre)
    21   (:export #:deflexer #:make-lexer))
     20  (:use #:cl #:cl-ppcre #:dso-util)
     21  (:export #:deflexer #:make-lexer #:lex-all))
    2222
    2323(in-package #:dso-lex)
    2424
    2525
     26
     27;;; regex manipulation
     28
     29(defun anchor-and-mode (regex)
     30  `(:sequence (:flags :single-line-mode-p) :start-anchor ,regex))
     31
     32(defun wrap (regex) (anchor-and-mode `(:regex ,regex)))
    2633
    2734(defun combine (regex-list)
     
    3037                 regex-list)))
    3138    (when (rest mapped) (setq mapped `((:alternation ,@mapped))))
    32     `(:sequence (:flags :single-line-mode-p) :start-anchor ,@mapped)))
     39    (anchor-and-mode (car mapped))))
     40
     41
     42
     43;;; creating lexing forms
     44
     45(defun break-defs (defs)
     46  (let (regexs classes filters)
     47    (dolist (d (reverse defs) (values regexs classes filters))
     48      (destructuring-bind (regex class &optional filter) d
     49        (push regex regexs)
     50        (push class classes)
     51        (push filter filters)))))
     52
     53(defun greedy-lexer-form (input-var start-var defs)
     54  (multiple-value-bind (regexs classes filters) (break-defs defs)
     55    (setf regexs (mapcar 'wrap regexs))
     56    `(let ((classes ,(coerce classes 'vector))
     57           (filters ,(coerce filters 'vector))
     58           max
     59           at)
     60       ,@(mapcar
     61          (lambda (i)
     62            `(let ((end (nth-value 1 (scan ',(nth i regexs) ,input-var :start ,start-var))))
     63               (format t "scanner ~A ended at ~A~%" ,i end)
     64               (when (and end (or (null at) (> end max)))
     65                 (setf max end
     66                       at ,i))))
     67          (range (length regexs)))
     68       (when at
     69         (let ((image (make-array (- max ,start-var)
     70                                  :element-type 'character
     71                                  :displaced-to ,input-var
     72                                  :displaced-index-offset ,start-var))
     73               (filter (aref filters at)))
     74           (values (aref classes at)
     75                   (if filter (funcall filter image) image)
     76                   max))))))
    3377
    3478(defun lexer-form (input-var start-var defs)
     
    5296
    5397
    54 (defun make-lexer (defs)
     98;;; creating lexing functions
     99
     100(defun make-lexer (defs &key priority-only)
    55101  "Returns a lexer function.  The DEFS consists of token-class
    56102definitions, each being a list of a regular expression, the name of
    57103the class, and an optional filter.  The returned function takes as
    58 arguments an input sequence and an optional start position.
     104arguments an input sequence and an optional start position, and
     105returning the matched token-class, image, and image-length as values.
    59106
    60 Currently, matching is done using *only* priority (first match wins),
    61 and does not look for the longest match.
     107Unless PRIORITY-ONLY is true, the longest match will win, and
     108rule-priority will only be used to break ties.  Otherwise, the first
     109match wins.
    62110
    63111Example:
    64112
    65 (let ((lexer (make-lexer '((\"[0-9]+\" number parse-integer)
    66                            (\"[a-zA-Z]\" letter)))))
    67   (funcall lexer \"2pi\" 1))"
     113 (let ((lexer (make-lexer '((\"[0-9]+\" number parse-integer)
     114                            (\"[a-zA-Z]\" letter)))))
     115   (funcall lexer \"2pi\" 1))"
    68116  (eval `(lambda (input &optional (start 0))
    69            ,(lexer-form 'input 'start defs))))
     117           ,(if priority-only
     118                (lexer-form 'input 'start defs)
     119                (greedy-lexer-form 'input 'start defs)))))
    70120
    71 (defmacro deflexer (name &body defs)
    72   "Defines a lexer, called as a function of the given NAME.  The body
     121(defmacro deflexer (name (&key priority-only) &body defs)
     122  "Defines a lexer, called as a function of the given NAME, and returning
     123the matched token-class, image, and image-length as values.  The body
    73124consists of token-class definitions, each being a list of a regular
    74125expression, the name of the class, and an optional filter.
    75126
    76 Currently, matching is done using *only* priority (first match wins),
    77 and does not look for the longest match.
     127Unless PRIORITY-ONLY is true, the longest match will win, and
     128rule-priority will only be used to break ties.  Otherwise, the first
     129match wins.
    78130
    79131Example:
    80132
    81 (deflexer lexer
    82   (\"[0-9]+\" number parse-integer)
    83   (\"[a-zA-Z]\" letter))
     133 (deflexer lexer ()
     134   (\"[0-9]+\" number parse-integer)
     135   (\"[a-zA-Z]\" letter))
    84136
    85 (lexer \"2pi\" 1)"
     137 (lexer \"2pi\" 1)"
    86138  `(defun ,name (input &optional (start 0))
    87      ,(lexer-form 'input 'start defs)))
     139     ,(if priority-only
     140          (lexer-form 'input 'start defs)
     141          (greedy-lexer-form 'input 'start defs))))
     142
     143(defun lex-all (lexer input)
     144  (labels ((scan (start tokens)
     145             (if (> (length input) start)
     146                 (multiple-value-bind (class image remainder)
     147                     (funcall lexer input start)
     148                   (when class
     149                     (scan remainder (cons (cons class image) tokens))))
     150                 (nreverse tokens))))
     151    (scan 0 '())))
Note: See TracChangeset for help on using the changeset viewer.