(* $Id: unimap_to_ocaml.ml,v 1.3 2000/08/29 00:48:52 gerd Exp $
 * ----------------------------------------------------------------------
 *
 *)

open Printf;;

let comment_re = Str.regexp "#.*$";;
let space_re = Str.regexp "[ \t\r\n]+";;

let read_unimap_format_a fname f =
  (* Reads a Unicode mapping in format A from a "local" code to Unicode.
   * Returns a list of pairs (localcode, unicode).
   *)
  
  let read_unimap_line() =
    let s = input_line f in    (* may raise End_of_file *)
    let s' = Str.global_replace comment_re "" s in
    let words = Str.split space_re s' in
    match words with
	[] -> raise Not_found
      | [ localcode; unicode ] ->
	  int_of_string localcode, int_of_string unicode
      | _ ->
	  failwith ("File " ^ fname ^ ": Do not know what to do with:\n" ^ s')
  in

  let rec read_following_lines() =
    try
      let localcode, unicode = read_unimap_line() in 
                               (* may raise End_of_file, Not_found *)
      (localcode, unicode) :: read_following_lines()
    with
	Not_found -> read_following_lines()
      | End_of_file -> []
  in

  read_following_lines()
;;


type from_uni_list =
    U_nil
  | U_single of (int * int)
  | U_list of (int * int) list

type from_unicode =
    from_uni_list array;;
  (* A hashtable with fixed size (256). A pair (unicode, localcode) is
   * stored at the position unicode mod 256 in the array.
   *)


let make_bijection unimap =
  (* unimap: a list of pairs (localcode, unicode)
   * returns a pair of arrays (m_to_unicode, m_from_unicode) with:
   *   - m_to_unicode.(localcode) = Some unicode, 
   *                                 if the pair (localcode, unicode) exists
   *     m_to_unicode.(x) = None otherwise
   *   - m_from_unicode.(unicode lsr 8) = [ ...; (unicode,localcode); ... ]
   *)

  let m_to_unicode   = Array.create 256 None in
  let m_from_unicode = Array.create 256 [] in

  List.iter
    (fun (localcode, unicode) ->
       assert(localcode < 256);

       (* Update m_to_unicode: *)
       if m_to_unicode.(localcode) <> None then
	 failwith ("Local code point " ^ string_of_int localcode ^ 
		   " mapped twice");
       m_to_unicode.(localcode) <- Some unicode;

       (* Update m_from_unicode: *)
       let unilow = unicode land 255 in
       if List.mem_assoc unicode (m_from_unicode.(unilow)) then
	 failwith ("Unicode code point " ^ string_of_int unicode ^ 
		   " mapped twice");
       m_from_unicode.(unilow) <- 
         m_from_unicode.(unilow) @ [unicode,localcode];
    )
    unimap;

  m_to_unicode, m_from_unicode
;;


let to_unimap_as_string to_unimap =
  let make_repr x =
    match x with
	None -> -1
      | Some u -> u
  in
  Marshal.to_string (Array.map make_repr to_unimap) [ Marshal.No_sharing ]
;;


let from_unimap_as_string from_unimap =
  let make_repr l =
    match l with
	[]    -> U_nil
      | [u,l] -> U_single(u,l)
      | _     -> U_list l
  in
  let m = Array.map make_repr from_unimap in
  Marshal.to_string m [ Marshal.No_sharing ]
;;


let print_bijection f name m_to_unicode m_from_unicode =
  (* Prints on file f this O'Caml code:
   * let <name>_to_unicode = ...
   * let <name>_from_unicode = ...
   *)
  fprintf f "let %s_to_unicode = lazy (Marshal.from_string \"%s\" 0 : int array);;\n" 
    name 
    (String.escaped (to_unimap_as_string m_to_unicode));

  fprintf f "let %s_from_unicode = lazy (Marshal.from_string \"%s\" 0 : Netmappings.from_uni_list array);;\n "
    name
    (String.escaped (from_unimap_as_string m_from_unicode));
;;


let main() =
  let files = ref [] in
  let outch = ref (lazy stdout) in
  Arg.parse
      [ "-o", Arg.String (fun s -> outch := lazy (open_out s)),
           " <file>   Write result to this file"]
      (fun s -> files := !files @ [s])
      "usage: unimap_to_ocaml file.unimap ...";
  
  (* First read in all unimaps: *)
  let unimaps =
    List.map
      (fun filename ->
	 let mapname = Str.replace_first (Str.regexp "\.unimap$") "" 
			                 (Filename.basename filename) in
	 let f = open_in filename in
	 prerr_endline ("Reading " ^ filename);
	 let unimap = read_unimap_format_a filename f in
	 close_in f;
	 mapname, unimap
      )
      !files
  in

  (* Second compute all bijections: *)
  let bijections =
    List.map
      (fun (mapname, unimap) ->
	 prerr_endline ("Processing " ^ mapname);
	 let to_unicode, from_unicode = make_bijection unimap in
	 mapname, to_unicode, from_unicode
      )
      unimaps
  in

  let out = Lazy.force !outch in
  (* Third output all results: *)
  output_string out "(* WARNING! This is a generated file! *)\n";

  List.iter
    (fun (mapname, to_unicode, from_unicode) ->
       print_bijection out mapname to_unicode from_unicode)
    bijections;
  List.iter
    (fun (mapname, _, _) ->
       fprintf out "Hashtbl.add Netmappings.to_unicode `Enc_%s %s_to_unicode;\n" 
	           mapname mapname;
       fprintf out "Hashtbl.add Netmappings.from_unicode `Enc_%s %s_from_unicode;\n" 
	           mapname mapname;
    )
    (List.rev bijections);
  fprintf out "();;\n";

  close_out out
;;


main();;

(* ======================================================================
 * History:
 * 
 * $Log: unimap_to_ocaml.ml,v $
 * Revision 1.3  2000/08/29 00:48:52  gerd
 * 	Conversion tables are now stored in marshalled form.
 * 	New type for the conversion table Unicode to 8bit.
 *
 * Revision 1.2  2000/08/12 23:54:56  gerd
 * 	Initial revision.
 *
 * 
 *)
