;;;; regex.scm - Unit for using the GNU regex package
;
; Copyright (c) 2000-2002, Felix L. Winkelmann
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following
; conditions are met:
;
;   Redistributions of source code must retain the above copyright notice, this list of conditions and the following
;     disclaimer. 
;   Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
;     disclaimer in the documentation and/or other materials provided with the distribution. 
;   Neither the name of the author nor the names of its contributors may be used to endorse or promote
;     products derived from this software without specific prior written permission. 
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
; POSSIBILITY OF SUCH DAMAGE.
;
; Send bugs, suggestions and ideas to: 
;
; felix@call-with-current-continuation.org
;
; Felix L. Winkelmann
; Steinweg 1A
; 37130 Gleichen, OT Weissenborn
; Germany


(declare 
  (unit regex)
  (standard-bindings)
  (extended-bindings)
  (interrupts-disabled)
  (no-bound-checks)
  (bound-to-procedure
   ##sys#check-string ##sys#check-exact ##sys#make-pointer ##sys#cons ##sys#size ##sys#slot
   ##regexp#compile ##regexp#gather-results ##regexp#re-match ##regexp#re-search
   ##regexp#re-compile-pattern)
  (foreign-declare "
#if defined(__FreeBSD__) || defined(__NetBSD__)
# include <gnuregex.h>
#else
# include <regex.h>
#endif

#define C_regexp_initialize (re_syntax_options = RE_INTERVALS | RE_NO_BK_BRACES | RE_NO_BK_PARENS | RE_NO_BK_VBAR)
#define C_regexp_alloc_buffer(ptr) (C_set_block_item((ptr), 0, (C_word)calloc(1, sizeof(struct re_pattern_buffer))))
#define C_regexp_alloc_registers(ptr) C_set_block_item((ptr), 0, (C_word)malloc(sizeof(struct re_registers)))
#define C_regexp_count_matches(ptr) C_fix(((struct re_pattern_buffer *)C_slot(ptr, 0))->re_nsub + 1)
#define C_regexp_register_start(rptr, i) C_fix(((struct re_registers *)C_slot(rptr, 0))->start[ C_unfix(i) ])
#define C_regexp_register_end(rptr, i) C_fix(((struct re_registers *)C_slot(rptr, 0))->end[ C_unfix(i) ])") )

(cond-expand
 [unsafe
  (eval-when (compile)
    (define-macro (##sys#check-structure x y) '(##core#undefined))
    (define-macro (##sys#check-range x y z) '(##core#undefined))
    (define-macro (##sys#check-pair x) '(##core#undefined))
    (define-macro (##sys#check-list x) '(##core#undefined))
    (define-macro (##sys#check-symbol x) '(##core#undefined))
    (define-macro (##sys#check-string x) '(##core#undefined))
    (define-macro (##sys#check-char x) '(##core#undefined))
    (define-macro (##sys#check-exact x) '(##core#undefined))
    (define-macro (##sys#check-port x) '(##core#undefined))
    (define-macro (##sys#check-number x) '(##core#undefined))
    (define-macro (##sys#check-byte-vector x) '(##core#undefined)) ) ]
 [else] )

(register-feature! 'regex)


;;; Create global pattern buffer and registers and initalize:

(define-constant ##regexp#buffer-count 5)

(define ##regexp#buffers
  (list (cons "" (##sys#make-pointer))
	(cons "" (##sys#make-pointer))
	(cons "" (##sys#make-pointer))
	(cons "" (##sys#make-pointer))
	(cons "" (##sys#make-pointer)) ) )

(define ##regexp#buffer-index 0)
(define ##regexp#registers (##sys#make-pointer))

(define-foreign-variable ##regexp#initialize void "C_regexp_initialize")
##regexp#initialize
(for-each (lambda (b) (##core#inline "C_regexp_alloc_buffer" (cdr b))) ##regexp#buffers)
(set! ##regexp#buffers (list->vector ##regexp#buffers))
(##core#inline "C_regexp_alloc_registers" ##regexp#registers)


;;; Compile regular expression into pattern buffer:

(define ##regexp#re-compile-pattern
  (foreign-lambda int "re_compile_pattern" c-string int c-pointer) )

(define ##regexp#compile
  (let ([error error])
    (lambda (regexp)
      (##sys#check-string regexp)
      (let ([index #f])
	(let loop ([i 0])
	  (cond [(fx>= i ##regexp#buffer-count)
		 (set! index ##regexp#buffer-index)
		 (set! ##regexp#buffer-index (fx+ index 1)) 
		 (when (fx>= ##regexp#buffer-index ##regexp#buffer-count)
		   (set! ##regexp#buffer-index 0) ) ]
		[(string=? regexp (##sys#slot (##sys#slot ##regexp#buffers i) 0))
		 (set! index i) ]
		[else (loop (fx+ i 1))] ) )
	(let ([b (##sys#slot ##regexp#buffers index)])
	  (if (not (eq? 0 (##regexp#re-compile-pattern regexp (##sys#size regexp) (##sys#slot b 1))))
	      (error "can not compile regular expression" regexp) 
	      (##sys#setslot b 0 regexp) )
	  (##sys#slot b 1) ) ) ) ) )


;;; Gather matched result strings or positions:

(define (##regexp#gather-result-positions result b)
  (and (fx>= result 0)
       (let ([n (##core#inline "C_regexp_count_matches" b)])
	 (let loop ([i 0])
	   (if (fx>= i n)
	       '()
	       (let ([start (##core#inline "C_regexp_register_start" ##regexp#registers i)])
		 (cons
		  (if (fx>= start 0)
		      (cons start (cons (##core#inline "C_regexp_register_end" ##regexp#registers i) '()))
		      #f)
		  (loop (fx+ i 1)) ) ) ) ) ) ) )

(define ##regexp#gather-results
  (let ((substring substring))
    (lambda (result str b)
      (let ((ps (##regexp#gather-result-positions result b)))
	(and ps
	     (##sys#map (lambda (poss) (and poss (apply substring str poss)))
		   ps) ) ) ) ) )


;;; Match string with regular expression:

(define ##regexp#re-match
  (foreign-lambda int "re_match" c-pointer c-string int int c-pointer) )

(let ([b #f])

  (define (prepare regexp str start)
    (##sys#check-string str)
    (let ([si (if (pair? start) (##sys#slot start 0) 0)])
      (##sys#check-exact si)
      (set! b (##regexp#compile regexp))
      (##regexp#re-match b str (##sys#size str) si ##regexp#registers) ) )

  (set! string-match
    (lambda (regexp str . start)
      (let ([m (prepare regexp str start)])
	(##regexp#gather-results m str b) ) ) )

  (set! string-match-positions
    (lambda (regexp str . start)
      (let ([m (prepare regexp str start)])
	(##regexp#gather-result-positions m b) ) ) ) )


;;; Search string with regular expression:

(define ##regexp#re-search
  (foreign-lambda int "re_search" c-pointer c-string int int int c-pointer) )

(let ([b #f])

  (define (prepare regexp str start-and-range)
    (##sys#check-string str)
    (let* ([slen (##sys#size str)]
	   [range (and (##core#inline "C_blockp" start-and-range) 
		       (##sys#slot start-and-range 1) ) ]
	   [si (if range (##sys#slot start-and-range 0) 0)]
	   [ri (if (##core#inline "C_blockp" range) (##sys#slot range 0) slen)] )
      (##sys#check-exact si)
      (##sys#check-exact ri)
      (set! b (##regexp#compile regexp))
      (##regexp#re-search b str slen si ri ##regexp#registers) ) )

  (set! string-search 
    (lambda (regexp str . start-and-range)
      (let ([s (prepare regexp str start-and-range)])
	(##regexp#gather-results s str b) ) ) )

  (set! string-search-positions
    (lambda (regexp str . start-and-range)
      (let ([s (prepare regexp str start-and-range)])
	(##regexp#gather-result-positions s b) ) ) ) )


;;; Split string into fields:

(define string-split-fields
  (let ([error error]
	[reverse reverse]
	[substring substring]
	[string-search-positions string-search-positions] )
    (lambda (regexp str . mode-and-start)
      (##sys#check-string regexp)
      (##sys#check-string str)
      (let* ([argc (length mode-and-start)]
	     [len (##sys#size str)]
	     [mode (if (fx> argc 0) (car mode-and-start) #t)]
	     [start (if (fx> argc 1) (cadr mode-and-start) 0)] 
	     [fini (case mode
		     [(#:suffix)
		      (lambda (ms start)
			(if (fx< start len)
			    (error "record does not end with suffix" str regexp)
			    (reverse ms) ) ) ]
		     [(#:infix)
		      (lambda (ms start)
			(if (and (null? ms) (fx>= start len))
			    '()
			    (reverse (cons (substring str start len) ms)) ) ) ]
		     [else (lambda (ms start) (reverse ms)) ] ) ]
	     [fetch (case mode
		      [(#:infix #:suffix) (lambda (start from to) (substring str start from))]
		      [else (lambda (start from to) (substring str from to))] ) ] )
	(let loop ([ms '()] [start start])
	  (let ([m (string-search-positions regexp str start)])
	    (if m
		(let* ([mp (##sys#slot m 0)]
		       [from (##sys#slot mp 0)]
		       [to (cadr mp)] )
		  (loop (cons (fetch start from to) ms) to) )
		(fini ms start) ) ) ) ) ) ) )


;;; Substitute matching strings:

(define string-substitute
  (let ([substring substring]
	[reverse reverse]
	[make-string make-string]
	[string-search-positions string-search-positions] )
    (lambda (regex subst string . flag)
      (##sys#check-string subst)
      (let* ([which (if (pair? flag) (car flag) 1)]
	     [substlen (##sys#size subst)]
	     [substlen-1 (fx- substlen 1)]
	     [result '()] 
	     [total 0] )

	(define (push x) 
	  (set! result (cons x result))
	  (set! total (fx+ total (##sys#size x))) )
	
	(define (substitute matches)
	  (let loop ([start 0] [index 0])
	    (if (fx>= index substlen-1)
		(push (if (fx= start 0) subst (substring subst start substlen)))
		(let ([c (##core#inline "C_subchar" subst index)]
		      [index+1 (fx+ index 1)] )
		  (if (char=? c #\\) 
		      (let ([c2 (##core#inline "C_subchar" subst index+1)])
			(if (not (char=? #\\ c2))
			    (let ([mi (list-ref matches (fx- (char->integer c2) 48))])
			      (push (substring subst start index))
			      (push (substring string (car mi) (cadr mi))) 
			      (loop (fx+ index 2) index+1) )
			    (loop start (fx+ index+1 1)) ) )
		      (loop start index+1) ) ) ) ) )

	(define (concatenate strs)
	  (let ([str (make-string total)])
	    (let loop ([ss strs] [index 0])
	      (if (null? ss) 
		  str
		  (let* ([si (car ss)]
			 [len (##sys#size si)] )
		    (##core#inline "C_substring_copy" si str 0 len index)
		    (loop (cdr ss) (fx+ index len)) ) ) ) ) )

	(let loop ([index 0] [count 1])
	  (let ([matches (string-search-positions regex string index)])
	    (cond [matches
		   (let* ([range (car matches)]
			  [upto (cadr range)] )
		     (cond [(or (not (fixnum? which)) (fx= count which))
			    (push (substring string index (car range)))
			    (substitute matches)
			    (loop upto #f) ]
			   [else
			    (push (substring string index upto))
			    (loop upto (fx+ count 1)) ] ) ) ]
		  [else
		   (push (substring string index (##sys#size string)))
		   (concatenate (reverse result)) ] ) ) ) ) ) ) )


;;; Some useful things:

(define pattern->regexp
  (let ([list->string list->string]
	[string->list string->list] )
    (lambda (s)
      (##sys#check-string s)
      (list->string
       (cons #\^
	     (let loop ([cs (string->list s)])
	       (if (null? cs)
		   '(#\$)
		   (let ([c (car cs)]
			 [rest (cdr cs)] )
		     (cond [(char=? c #\*) `(#\. #\* ,@(loop rest))]
			   [(char=? c #\?) (cons '#\. (loop rest))]
			   [(or (char-alphabetic? c) (char-numeric? c)) (cons c (loop rest))]
			   [else `(#\\ ,c ,@(loop rest))] ) ) ) ) ) ) ) ) )

(define grep
  (let ([string-match string-match])
    (lambda (rx lst)
      (##sys#check-string rx)
      (##sys#check-list lst)
      (let loop ([lst lst])
	(if (null? lst)
	    '()
	    (let ([x (car lst)]
		  [r (cdr lst)] )
	      (if (string-match rx x)
		  (cons x (loop r))
		  (loop r) ) ) ) ) ) ) )
