(* An HTML lexer *)
{
open Html

type tagtoken =
   Attribute of string * string
 | Closetag of int

(* Smart hack to make lexers reentrant.
 * Make each action a function taking "private" data as argument.
 * Invoke each action with additionnal argument.
 *  
 * This works only because calls to actions in csllex generated code
 * are terminal.
 *)

type t = {
  buffer : Ebuffer.t;
  mutable start : int
  }

let new_data () = {
  buffer = Ebuffer.create 512;
  start = 0 
  }

let strict = ref false
let verbose = ref false
}

rule html = parse
   '<' 
    { (fun lexdata ->
	 lexdata.start <- Lexing.lexeme_start lexbuf;
	 tag lexbuf lexdata
      )}
 | "<!>" 
    { (fun lexdata ->
	   Comment "",
      	   Loc (Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf)
       )}
 | "<!--" ['-']+ '>' 
    { (fun lexdata ->
      let lexeme = Lexing.lexeme lexbuf in
      	match (String.length lexeme - 3) mod 4 with
	  0 -> (* <!-- -- -- --> *)
           Comment "",
      	   Loc (Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf)
        | 1 -> (* <!-- -- -> : (without sp) incorrect *)
	   if !strict then
	    raise (Html_Lexing ("invalid comment", 
      	       	       	       	  Lexing.lexeme_start lexbuf))
	   else begin
	    if !verbose then begin
	      Printf.fprintf stderr "Invalid comment at pos %d\n"
      	       	   (Lexing.lexeme_start lexbuf);
	      flush stderr
	      end;
           Comment "",
      	   Loc (Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf)
	   end
        | 2 (* <!-- -- --> (without sp) *)
        | 3 -> (* <!-- -- ---> (without sp) *)
	   if !strict then begin
            lexdata.start <- Lexing.lexeme_start lexbuf;
	    Ebuffer.reset lexdata.buffer;
	    Ebuffer.output_string lexdata.buffer "->";
	    comment lexbuf lexdata
	    end
	   else begin
	    if !verbose then begin
	      Printf.fprintf stderr "Accepted as unclosed comment\n";
	      flush stderr
	      end;
	   Comment "",
      	   Loc (Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf)
      	   end )}
      
 | "<!--" 
    { (fun lexdata ->
      lexdata.start <- Lexing.lexeme_start lexbuf;
      comment lexbuf lexdata)}
 | "<!" ['D' 'd']['O' 'o']['C' 'c']['T' 't']['Y' 'y']['P' 'p']['E' 'e'] 
         [^ '>']* '>'
    { (fun lexdata ->
       Doctype (Lexing.lexeme lexbuf),
       Loc (Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf) )}

 | eof 
    { (fun lexdata -> EOF,
       Loc (Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf) )}
 | '&' 
    { (fun lexdata ->
     lexdata.start <- Lexing.lexeme_start lexbuf;
     Ebuffer.reset lexdata.buffer;
     Ebuffer.output_string lexdata.buffer (ampersand lexbuf lexdata);
     text lexbuf lexdata )}
 | "\r\n" 
    { (fun lexdata ->
     lexdata.start <- Lexing.lexeme_start lexbuf;
     Ebuffer.reset lexdata.buffer;
     Ebuffer.output_char lexdata.buffer '\n'; 
     text lexbuf lexdata )}
 | "\r" 
    { (fun lexdata ->
     lexdata.start <- Lexing.lexeme_start lexbuf;
     Ebuffer.reset lexdata.buffer;
     Ebuffer.output_char lexdata.buffer '\n'; 
     text lexbuf lexdata )}
 | _ { (fun lexdata ->
     lexdata.start <- Lexing.lexeme_start lexbuf;
     Ebuffer.reset lexdata.buffer;
     Ebuffer.output_char lexdata.buffer (Lexing.lexeme_char lexbuf 0); 
     text lexbuf lexdata )}

(* skip until next -- included *)
(* then look for next comment *)
and comment = parse
    "--" 
    { (fun lexdata -> next_comment lexbuf lexdata)}
  | _  
    { (fun lexdata ->
       Ebuffer.output_char lexdata.buffer (Lexing.lexeme_char lexbuf 0);
       comment lexbuf lexdata )}
  | ""
    { (fun lexdata ->
   raise (Html_Lexing ("unterminated comment", Lexing.lexeme_start lexbuf))
    )}

and next_comment = parse
    [' ' '\t' '\r' '\n']* "--"
    { (fun lexdata -> comment lexbuf lexdata )}
  | [' ' '\t' '\r' '\n']* '>'
    { (fun lexdata ->
      Comment (Ebuffer.get lexdata.buffer),
      Loc(lexdata.start, Lexing.lexeme_end lexbuf) )}
  | "" 
    { (fun lexdata ->
       raise (Html_Lexing ("invalid comment", Lexing.lexeme_start lexbuf)))}

and text = parse
   [^ '<' '&' '\r']*
    { (fun lexdata ->
      Ebuffer.output_string lexdata.buffer (Lexing.lexeme lexbuf);	
      PCData (Ebuffer.get lexdata.buffer), 
      Loc(lexdata.start, Lexing.lexeme_end lexbuf)
      )}
 | [^ '<' '&' '\r']* '&'
    { (fun lexdata ->
       let lexeme = Lexing.lexeme lexbuf in
      	 Ebuffer.output lexdata.buffer lexeme 0 (String.length lexeme -1) ;
	 Ebuffer.output_string lexdata.buffer (ampersand lexbuf lexdata); 
      	 text lexbuf lexdata )}
 | [^ '<' '&' '\r']* '\r' '\n'
    { (fun lexdata ->
      	let lexeme = Lexing.lexeme lexbuf in
      	 Ebuffer.output lexdata.buffer lexeme 0 (String.length lexeme - 2);
         Ebuffer.output_char lexdata.buffer '\n';
      	 text lexbuf lexdata )}
 | [^ '<' '&' '\r']* '\r'
    { (fun lexdata ->
        let lexeme = Lexing.lexeme lexbuf in
      	 Ebuffer.output lexdata.buffer lexeme 0 (String.length lexeme - 1);
         Ebuffer.output_char lexdata.buffer '\n';
      	 text lexbuf lexdata )}
 (* no default case needed *)

and ampersand = parse
   '#' ['0'-'9']+ ';' 
    { (fun lexdata ->
        let lexeme = Lexing.lexeme lexbuf in
      	let code = String.sub lexeme 1 (String.length lexeme - 2) in
	let c = Char.chr (int_of_string code) in
	 String.make 1 c
      )}
 | ['a'-'z' 'A'-'Z'] ['a'-'z' 'A'-'Z' '0'-'9']* ';'
    { (fun lexdata ->
        let lexeme = Lexing.lexeme lexbuf in
      	let entity = String.sub lexeme 0 (String.length lexeme - 1) in
	  begin try 
	    get_entity entity
	  with (* 4.2.1 undeclared markup error handling *)
	    Not_found ->
	      ("&" ^ lexeme)
          end
      )}
  (* terminating ; is not required if next character could not be 
     part of the lexeme *)
 | '#' ['0'-'9']+
    { (fun lexdata ->
        let lexeme = Lexing.lexeme lexbuf in
      	let code = String.sub lexeme 1 (String.length lexeme - 1) in
	let c = Char.chr (int_of_string code) in
	  String.make 1 c
      )}
 | ['a'-'z' 'A'-'Z'] ['a'-'z' 'A'-'Z' '0'-'9']*
    { (fun lexdata ->
      	let lexeme = Lexing.lexeme lexbuf in
	  try get_entity lexeme
	  with (* 4.2.1 undeclared markup error handling *)
	    Not_found -> ("&"^lexeme)
      )}
  (* Tolerance ... *)
  | ""
    { (fun lexdata -> "&" )}


(* TODO 2.0: 
 *   syntax for SHORTTAG YES (need to know the DTD for this !).
 *
 *)
and tag = parse
   ['a'-'z' 'A'-'Z' '0'-'9' '.' '-']+
    { (fun lexdata ->
         let tagname = Mstring.lowercase (Lexing.lexeme lexbuf)
         and attribs = ref []  in
	 let rec read_attribs () =
	   match attrib lexbuf lexdata with
	     Closetag n -> n
	   | Attribute(p1, p2) ->
      	       attribs := (p1, p2) :: !attribs; read_attribs() in
	 let e = read_attribs() in
          OpenTag {tag_name = tagname; attributes = List.rev !attribs },
	  Loc(lexdata.start, e)
        )}
  
  | '/' ['a'-'z' 'A'-'Z' '0'-'9' '.' '-']+
    { (fun lexdata ->
        let lexeme = Lexing.lexeme lexbuf in
	let e = skip_to_close lexbuf lexdata in
	 CloseTag (Mstring.lowercase 
                      (String.sub lexeme 1 (String.length lexeme - 1))),
      	 Loc(lexdata.start,e))}
(* Tolerance *)
  | ""  
    { (fun lexdata ->
      	  Ebuffer.reset lexdata.buffer;
          Ebuffer.output_char lexdata.buffer '<';
          text lexbuf lexdata)}

and attrib = parse
    [' ' '\t' '\n' '\r']+ 
    { (fun lexdata -> attrib lexbuf lexdata )}
  | ['a'-'z' 'A'-'Z' '0'-'9' '.' '-']+ 
    { (fun lexdata ->
      	let name = Mstring.lowercase(Lexing.lexeme lexbuf) in
      	match tagattrib lexbuf lexdata with
      	  Some s -> Attribute (name, s)
      	| None -> Attribute (name, name)
      )}
   (* added '_' so we can parse Netscape bookmark files, 
      but it should NOT be there *)
  | ['a'-'z' 'A'-'Z' '0'-'9' '.' '-' '_']+ 
    { (fun lexdata ->
       let name = Mstring.lowercase(Lexing.lexeme lexbuf) in
      	 if !strict then
      	  raise (Html_Lexing ("illegal attribute name: " ^ name,
                                Lexing.lexeme_start lexbuf))
	 else begin
	  if !verbose then begin
	    Printf.fprintf stderr "Illegal attribute name: %s at pos %d\n"
                 name (Lexing.lexeme_start lexbuf);
	    flush stderr
	    end;
	  match tagattrib lexbuf lexdata with
	    Some s -> Attribute (name, s)
	  | None -> Attribute (name, name)
      	 end
      )}
  | '>'
    { (fun lexdata -> Closetag (Lexing.lexeme_end lexbuf) )}
  | _
    { (fun lexdata ->
   raise (Html_Lexing ("invalid attribute name", Lexing.lexeme_start lexbuf))
   )}

and tagattrib = parse
    [' ' '\t' '\n' '\r']* '=' [' ' '\t' '\n' '\r']*
    { (fun lexdata -> Some (attribvalue lexbuf lexdata) )}
  | "" 
    { (fun lexdata -> None )}
  
(* This should be dependent on the attribute name *)
(* people often forget to quote, so try to do something about it *)
(* but if a quote is not closed, you are dead *)
and attribvalue = parse
    ['a'-'z' 'A'-'Z' '0'-'9' '.' '-']+ 
    { (fun lexdata -> Lexing.lexeme lexbuf )}
  | '"' 
    { (fun lexdata ->
       Ebuffer.reset lexdata.buffer; inquote lexbuf lexdata )}
  | '\''
    { (fun lexdata ->
       Ebuffer.reset lexdata.buffer; insingle lexbuf lexdata )}
  | [^ '"' '\''] [^ ' ' '\t' '\n' '\r' '>']*
    { (fun lexdata ->
       let v = Lexing.lexeme lexbuf in
      	 if !strict then
      	  raise (Html_Lexing ("illegal attribute val: " ^ v,
                                Lexing.lexeme_start lexbuf))
	 else begin
	  if !verbose then begin
	    Printf.fprintf stderr "Illegal attribute val: %s at pos %d\n" 
                v (Lexing.lexeme_start lexbuf);
	    flush stderr
	    end;
	  v
      	 end)}
  | _ 
    { (fun lexdata ->
        raise (Html_Lexing ("illegal attribute val",
                              Lexing.lexeme_start lexbuf)) )}

and inquote = parse
    [^ '"' '&']+
    { (fun lexdata ->
       	 Ebuffer.output_string lexdata.buffer (Lexing.lexeme lexbuf);
      	 inquote lexbuf lexdata )}
  | '"'
    { (fun lexdata ->
      	 beautify true (Ebuffer.get lexdata.buffer) )}
  | '&'
    { (fun lexdata ->
      	Ebuffer.output_string lexdata.buffer (ampersand lexbuf lexdata);
      	inquote lexbuf lexdata )}
  | ""
    { (fun lexdata ->
     raise (Html_Lexing ("unclosed \"", Lexing.lexeme_start lexbuf))
     )}
  
and insingle = parse
    [^ '\'' '&']+
    { (fun lexdata ->
      	Ebuffer.output_string lexdata.buffer (Lexing.lexeme lexbuf);
      	insingle lexbuf lexdata )}
  | '\''
    { (fun lexdata -> beautify true (Ebuffer.get lexdata.buffer) )}
  | '&'
    { (fun lexdata ->
      	Ebuffer.output_string lexdata.buffer (ampersand lexbuf lexdata);
      	insingle lexbuf lexdata )}
  | ""
    { (fun lexdata ->
    raise (Html_Lexing ("unclosed '", Lexing.lexeme_start lexbuf))
    )}

and skip_to_close = parse
   [^'>']* '>' { (fun lexdata -> Lexing.lexeme_end lexbuf )}
  | "" { raise (Html_Lexing ("unterminated tag", 
      	       	       	       	  Lexing.lexeme_start lexbuf)) }

and cdata = parse
   [^ '<']* (['<']+ [^ '/']) ? { 
      (fun lexdata -> 
      	CData(Lexing.lexeme lexbuf),
        Loc(Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf))}
      
  | "</" ['a'-'z' 'A'-'Z' '0'-'9' '.' '-']+
    { (fun lexdata ->
        let lexeme = Lexing.lexeme lexbuf in
	let e = skip_to_close lexbuf lexdata in
	 CloseTag (Mstring.lowercase 
                      (String.sub lexeme 2 (String.length lexeme - 2))),
        Loc(Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf))}
  | "</" {
      (fun lexdata ->
      	CData(Lexing.lexeme lexbuf),
        Loc(Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf)) }
    
