Pages

Saturday, 7 January 2017

EBNF Grammar for Parsing Chrome Bookmarks

The bookmarks HTML file exported by Chrome is not valid HTML: it follows different rules and uses a different DTD. Here is an ANTLR 4 grammar for parsing the bookmarks, with support for Unicode characters in bookmark names.
grammar Bookmarks;
 
// Entry rule: an optional prolog, then meta elements and a single top-level
// dl list, with misc items allowed before, between and after them.
document
    : prolog? misc* meta* misc* dl misc*
    ;

// The prolog is a single DTD token (a "<! ... >" declaration).
prolog
    : DTD
    ;

// Miscellaneous items allowed between the structural parts of the document.
// Only comments appear here: the S (whitespace) token is discarded by the
// lexer via "-> skip" and therefore never reaches the parser, so an
// alternative matching S could never be taken.
misc
    : COMMENT
    ;

// A header element that precedes the bookmark list. After the opening
// '<' TEXT it is either
//   - '>' TEXT '</' TEXT '>'   : an element wrapping a single TEXT token, or
//   - attribute* '>'           : an attribute-carrying tag with no closing tag.
meta
    : '<' TEXT
      ( '>' TEXT '</' TEXT '>'
      | attribute* '>'
      )
    ;

// A list: an opening pair of adjacent tags, any number of dt entries
// (optionally surrounded by misc items), and a closing tag immediately
// followed by another tag. The '><' literal is a single token, so the two
// tags must be written with no whitespace between them.
dl
    : '<' TEXT '><' TEXT '>'
      misc* dt* misc*
      '</' TEXT '><' TEXT '>'
    ;

// One entry inside a dl. It is either a nested dl, or an item of the form
//   '<' TEXT '><' tag attribute* ... tag '>'
// where "..." is '>' content '</' for an element with text, or the single
// token '></' for an element whose body is empty.
dt
    : '<' TEXT '><' tag attribute*
      ( '>' content '</'
      | '></'
      )
      tag '>'
    | dl
    ;

// A single name="value" pair inside a tag.
// Whitespace between attributes needs no alternative of its own: the S token
// is dropped by the lexer ("-> skip") and is never seen by the parser, so an
// alternative matching S would be unreachable.
attribute
    : attributeName '=' attributeValue
    ;

// Name of the element nested in a dt: the dedicated H3 token or any TEXT token.
tag : H3 | TEXT ;

// Attribute name: a single TEXT token.
attributeName
    : TEXT
    ;

// Attribute value: a single double-quoted VAL token (quotes included).
attributeValue
    : VAL
    ;

// Text between an opening and a closing tag: one or more word tokens.
// The H3 token is accepted alongside TEXT because the lexer rule H3 is
// declared before TEXT, so a standalone word "H3" in a title is tokenized as
// H3 rather than TEXT and would otherwise make the title unparseable.
content : (TEXT | H3)+;

// Comment token. It is declared before DTD on purpose: both rules can match
// the same text ("<!-- ... -->"), and when two lexer rules match the same
// length ANTLR chooses the one defined first. Trailing whitespace is optional;
// when present it is absorbed into the token, and a comment followed directly
// by '<' or by end-of-file is still recognized as a COMMENT rather than
// falling through to DTD.
COMMENT : '<!--' .*? '-->' [ \t\r\n]*;

// Declaration such as <!DOCTYPE ...>: everything from "<!" up to the first '>'.
DTD : '<!' .*? '>';

// Dedicated token for the literal tag name H3. It is listed ahead of TEXT, so
// the exact input "H3" becomes an H3 token rather than TEXT.
H3
    : 'H3'
    ;

// Double-quoted string: from a '"' up to the next '"' (non-greedy), with both
// quotes kept in the token text.
VAL
    : '"' .*? '"'
    ;

// A run of word characters: either ASCII letters, digits and the punctuation
// : / . @ - _ ; *  or a run of NameChar characters (covers non-ASCII names).
// Inside an ANTLR character set only '-' and ']' (and '\') need escaping;
// "\/", "\." and "\s" are not recognized escapes ("\s" is NOT a whitespace
// class in ANTLR) and newer ANTLR 4 releases reject them as invalid escape
// sequences, so the characters are written literally. Whitespace is handled
// separately by the S rule.
TEXT : [A-Za-z0-9:/.@\-_;*]+ | NameChar+;

// Any character allowed inside a name: a NameStartChar, an ASCII digit, an
// underscore, U+00B7, or a character in U+0300-U+036F / U+203F-U+2040.
fragment
NameChar
    : NameStartChar
    | [0-9_]
    | [\u00B7\u0300-\u036F\u203F-\u2040]
    ;

// Characters allowed at the start of a name: ASCII letters plus the listed
// non-ASCII ranges of the Basic Multilingual Plane.
fragment
NameStartChar
    : [A-Za-z]
    | [\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF]
    | [\u0370-\u037D\u037F-\u1FFF]
    | [\u200C-\u200D\u2070-\u218F]
    | [\u2C00-\u2FEF\u3001-\uD7FF]
    | [\uF900-\uFDCF\uFDF0-\uFFFD]
    ;

// Whitespace (space, tab, CR, LF): matched and then discarded, so the parser
// never receives an S token.
S
    : (' ' | '\t' | '\r' | '\n')+ -> skip
    ;

A sample of the exported bookmarks:
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
     It will be read and overwritten.
     DO NOT EDIT! -->
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
    <DT><H3 ADD_DATE="1481473849" LAST_MODIFIED="1481473992" PERSONAL_TOOLBAR_FOLDER="true">Bookmarks bar</H3>
    <DL><p>
        <DT><H3 ADD_DATE="1481473866" LAST_MODIFIED="1481473967">Test 1</H3>
        <DL><p>
            <DT><A HREF="https://encrypted.google.com/" ADD_DATE="1481473884" ICON="">Google</A>
            <DT><A HREF="https://yandex.ru/" ADD_DATE="1481473892" ICON="">Яндекс</A>
            <DT><A HREF="http://example.com/" ADD_DATE="1481473954">Example Domain</A>
        </DL><p>
        <DT><H3 ADD_DATE="1481473872" LAST_MODIFIED="1481473980">Test 2</H3>
        <DL><p>
            <DT><A HREF="https://duckduckgo.com/" ADD_DATE="1481473902" ICON="">DuckDuckGo</A>
            <DT><A HREF="https://clojure.news/" ADD_DATE="1481473936" ICON="">Clojure News</A>
            <DT><A HREF="http://example.com/" ADD_DATE="1481473955">Example Domain</A>
        </DL><p>
        <DT><A HREF="https://yandex.ru/" ADD_DATE="1481473893" ICON="">Яндекс</A>
        <DT><A HREF="http://www.echojs.com/" ADD_DATE="1481473986" ICON=""></A>
        <DT><A HREF="https://clojure.news/" ADD_DATE="1481473992" ICON=""></A>
        <DT><H3 ADD_DATE="1481474004" LAST_MODIFIED="1481477692">Test 3</H3>
        <DL><p>
            <DT><A HREF="https://encrypted.google.com/" ADD_DATE="1481474004" ICON="">Google</A>
            <DT><A HREF="https://duckduckgo.com/" ADD_DATE="1481474004" ICON="">DuckDuckGo</A>
            <DT><A HREF="https://clojure.news/" ADD_DATE="1481474004" ICON="">Clojure News</A>
            <DT><H3 ADD_DATE="1481477681" LAST_MODIFIED="1481477681">Test 4</H3>
            <DL><p>
                <DT><A HREF="https://clojure.news/" ADD_DATE="1481477681" ICON="">Clojure News</A>
                <DT><A HREF="https://news.ycombinator.com/" ADD_DATE="1481477681" ICON="">Hacker News</A>
                <DT><A HREF="http://example.com/" ADD_DATE="1481477681">Example Domain</A>
            </DL><p>
            <DT><A HREF="https://news.ycombinator.com/" ADD_DATE="1481474004" ICON="">Hacker News</A>
            <DT><A HREF="http://example.com/" ADD_DATE="1481474004">Example Domain</A>
        </DL><p>
    </DL><p>
</DL><p>
The clj-antlr library can be used to get the parse tree from the grammar. The snippet below produces the parse tree. Use a compiled version of the grammar for better performance.
;; Build a parser from the grammar file with clj-antlr (referred to as `antlr`).
(def bm
  (antlr/parser "/home/kadaj/dev/clojure/bookmarks-parser/grammar/Bookmarks.g4"))

;; Read the exported bookmarks file, parse it with `bm`, and pretty-print the tree.
(-> "/home/kadaj/dev/clojure/bookmarks-parser/resources/bookmarks.html"
    slurp
    bm
    pprint)
Which produces the following parse tree.
(:document
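 (:prolog "<!DOCTYPE NETSCAPE-Bookmark-file-1>")
 (:misc
  "<!-- This is an automatically generated file.\n     It will be read and overwritten.\n     DO NOT EDIT! -->\n")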
 (:meta
  "<"
  "META"
  (:attribute
   (:attributeName "HTTP-EQUIV")
   "="
   (:attributeValue "\"Content-Type\""))
  (:attribute
   (:attributeName "CONTENT")
   "="
   (:attributeValue "\"text/html; charset=UTF-8\""))
  ">")
 (:meta "<" "TITLE" ">" "Bookmarks" "</" "TITLE" ">")
 (:meta "<" "H1" ">" "Bookmarks" "</" "H1" ">")
 (:dl
  "<"
  "DL"
  "><"
  "p"
  ">"
  (:dt
   "<"
   "DT"
   "><"
   (:tag "H3")
   (:attribute
    (:attributeName "ADD_DATE")
    "="
    (:attributeValue "\"1481473849\""))
   (:attribute
    (:attributeName "LAST_MODIFIED")
    "="
    (:attributeValue "\"1481473992\""))
   (:attribute
    (:attributeName "PERSONAL_TOOLBAR_FOLDER")
    "="
    (:attributeValue "\"true\""))
   ">"
   (:content "Bookmarks" "bar")
   "</"
   (:tag "H3")
   ">")
  (:dt
   (:dl
    "<"
    "DL"
    "><"
    "p"
    ">"
    (:dt
     "<"
     "DT"
     "><"
     (:tag "H3")
     (:attribute
      (:attributeName "ADD_DATE")
      "="
      (:attributeValue "\"1481473866\""))
     (:attribute
      (:attributeName "LAST_MODIFIED")
      "="
      (:attributeValue "\"1481473967\""))
     ">"
     (:content "Test" "1")
     "</"
     (:tag "H3")
     ">")
    (:dt
     (:dl
      "<"
      "DL"
      "><"
      "p"
      ">"
      (:dt
       "<"
       "DT"
       "><"
       (:tag "A")
       (:attribute
        (:attributeName "HREF")
        "="
        (:attributeValue "\"https://encrypted.google.com/\""))
       (:attribute
        (:attributeName "ADD_DATE")
        "="
        (:attributeValue "\"1481473884\""))
       (:attribute
        (:attributeName "ICON")
        "="
        (:attributeValue "\"\""))
       ">"
       (:content "Google")
       "</"
       (:tag "A")
       ">")
      (:dt
       "<"
       "DT"
       "><"
       (:tag "A")
       (:attribute
        (:attributeName "HREF")
        "="
        (:attributeValue "\"https://yandex.ru/\""))
       (:attribute
        (:attributeName "ADD_DATE")
        "="
        (:attributeValue "\"1481473892\""))
       (:attribute
        (:attributeName "ICON")
        "="
        (:attributeValue "\"\""))
       ">"
       (:content "Яндекс")
       "</"
       (:tag "A")
       ">")
      (:dt
       "<"
       "DT"
       "><"
       (:tag "A")
       (:attribute
        (:attributeName "HREF")
        "="
        (:attributeValue "\"http://example.com/\""))
       (:attribute
        (:attributeName "ADD_DATE")
        "="
        (:attributeValue "\"1481473954\""))
       ">"
       (:content "Example" "Domain")
       "</"
       (:tag "A")
       ">")
      "</"
      "DL"
      "><"
      "p"
      ">"))
    (:dt
     "<"
     "DT"
     "><"
     (:tag "H3")
     (:attribute
      (:attributeName "ADD_DATE")
      "="
      (:attributeValue "\"1481473872\""))
     (:attribute
      (:attributeName "LAST_MODIFIED")
      "="
      (:attributeValue "\"1481473980\""))
     ">"
     (:content "Test" "2")
     "</"
     (:tag "H3")
     ">")
    (:dt
     (:dl
      "<"
      "DL"
      "><"
      "p"
      ">"
      (:dt
       "<"
       "DT"
       "><"
       (:tag "A")
       (:attribute
        (:attributeName "HREF")
        "="
        (:attributeValue "\"https://duckduckgo.com/\""))
       (:attribute
        (:attributeName "ADD_DATE")
        "="
        (:attributeValue "\"1481473902\""))
       (:attribute
        (:attributeName "ICON")
        "="
        (:attributeValue "\"\""))
       ">"
       (:content "DuckDuckGo")
       "</"
       (:tag "A")
       ">")
      (:dt
       "<"
       "DT"
       "><"
       (:tag "A")
       (:attribute
        (:attributeName "HREF")
        "="
        (:attributeValue "\"https://clojure.news/\""))
       (:attribute
        (:attributeName "ADD_DATE")
        "="
        (:attributeValue "\"1481473936\""))
       (:attribute
        (:attributeName "ICON")
        "="
        (:attributeValue "\"\""))
       ">"
       (:content "Clojure" "News")
       "</"
       (:tag "A")
       ">")
      (:dt
       "<"
       "DT"
       "><"
       (:tag "A")
       (:attribute
        (:attributeName "HREF")
        "="
        (:attributeValue "\"http://example.com/\""))
       (:attribute
        (:attributeName "ADD_DATE")
        "="
        (:attributeValue "\"1481473955\""))
       ">"
       (:content "Example" "Domain")
       "</"
       (:tag "A")
       ">")
      "</"
      "DL"
      "><"
      "p"
      ">"))
    (:dt
     "<"
     "DT"
     "><"
     (:tag "A")
     (:attribute
      (:attributeName "HREF")
      "="
      (:attributeValue "\"https://yandex.ru/\""))
     (:attribute
      (:attributeName "ADD_DATE")
      "="
      (:attributeValue "\"1481473893\""))
     (:attribute (:attributeName "ICON") "=" (:attributeValue "\"\""))
     ">"
     (:content "Яндекс")
     "</"
     (:tag "A")
     ">")
    (:dt
     "<"
     "DT"
     "><"
     (:tag "A")
     (:attribute
      (:attributeName "HREF")
      "="
      (:attributeValue "\"http://www.echojs.com/\""))
     (:attribute
      (:attributeName "ADD_DATE")
      "="
      (:attributeValue "\"1481473986\""))
     (:attribute (:attributeName "ICON") "=" (:attributeValue "\"\""))
     "></"
     (:tag "A")
     ">")
    (:dt
     "<"
     "DT"
     "><"
     (:tag "A")
     (:attribute
      (:attributeName "HREF")
      "="
      (:attributeValue "\"https://clojure.news/\""))
     (:attribute
      (:attributeName "ADD_DATE")
      "="
      (:attributeValue "\"1481473992\""))
     (:attribute (:attributeName "ICON") "=" (:attributeValue "\"\""))
     "></"
     (:tag "A")
     ">")
    (:dt
     "<"
     "DT"
     "><"
     (:tag "H3")
     (:attribute
      (:attributeName "ADD_DATE")
      "="
      (:attributeValue "\"1481474004\""))
     (:attribute
      (:attributeName "LAST_MODIFIED")
      "="
      (:attributeValue "\"1481477692\""))
     ">"
     (:content "Test" "3")
     "</"
     (:tag "H3")
     ">")
    (:dt
     (:dl
      "<"
      "DL"
      "><"
      "p"
      ">"
      (:dt
       "<"
       "DT"
       "><"
       (:tag "A")
       (:attribute
        (:attributeName "HREF")
        "="
        (:attributeValue "\"https://encrypted.google.com/\""))
       (:attribute
        (:attributeName "ADD_DATE")
        "="
        (:attributeValue "\"1481474004\""))
       (:attribute
        (:attributeName "ICON")
        "="
        (:attributeValue "\"\""))
       ">"
       (:content "Google")
       "</"
       (:tag "A")
       ">")
      (:dt
       "<"
       "DT"
       "><"
       (:tag "A")
       (:attribute
        (:attributeName "HREF")
        "="
        (:attributeValue "\"https://duckduckgo.com/\""))
       (:attribute
        (:attributeName "ADD_DATE")
        "="
        (:attributeValue "\"1481474004\""))
       (:attribute
        (:attributeName "ICON")
        "="
        (:attributeValue "\"\""))
       ">"
       (:content "DuckDuckGo")
       "</"
       (:tag "A")
       ">")
      (:dt
       "<"
       "DT"
       "><"
       (:tag "A")
       (:attribute
        (:attributeName "HREF")
        "="
        (:attributeValue "\"https://clojure.news/\""))
       (:attribute
        (:attributeName "ADD_DATE")
        "="
        (:attributeValue "\"1481474004\""))
       (:attribute
        (:attributeName "ICON")
        "="
        (:attributeValue "\"\""))
       ">"
       (:content "Clojure" "News")
       "</"
       (:tag "A")
       ">")
      (:dt
       "<"
       "DT"
       "><"
       (:tag "H3")
       (:attribute
        (:attributeName "ADD_DATE")
        "="
        (:attributeValue "\"1481477681\""))
       (:attribute
        (:attributeName "LAST_MODIFIED")
        "="
        (:attributeValue "\"1481477681\""))
       ">"
       (:content "Test" "4")
       "</"
       (:tag "H3")
       ">")
      (:dt
       (:dl
        "<"
        "DL"
        "><"
        "p"
        ">"
        (:dt
         "<"
         "DT"
         "><"
         (:tag "A")
         (:attribute
          (:attributeName "HREF")
          "="
          (:attributeValue "\"https://clojure.news/\""))
         (:attribute
          (:attributeName "ADD_DATE")
          "="
          (:attributeValue "\"1481477681\""))
         (:attribute
          (:attributeName "ICON")
          "="
          (:attributeValue "\"\""))
         ">"
         (:content "Clojure" "News")
         "</"
         (:tag "A")
         ">")
        (:dt
         "<"
         "DT"
         "><"
         (:tag "A")
         (:attribute
          (:attributeName "HREF")
          "="
          (:attributeValue "\"https://news.ycombinator.com/\""))
         (:attribute
          (:attributeName "ADD_DATE")
          "="
          (:attributeValue "\"1481477681\""))
         (:attribute
          (:attributeName "ICON")
          "="
          (:attributeValue "\"\""))
         ">"
         (:content "Hacker" "News")
         "</"
         (:tag "A")
         ">")
        (:dt
         "<"
         "DT"
         "><"
         (:tag "A")
         (:attribute
          (:attributeName "HREF")
          "="
          (:attributeValue "\"http://example.com/\""))
         (:attribute
          (:attributeName "ADD_DATE")
          "="
          (:attributeValue "\"1481477681\""))
         ">"
         (:content "Example" "Domain")
         "</"
         (:tag "A")
         ">")
        "</"
        "DL"
        "><"
        "p"
        ">"))
      (:dt
       "<"
       "DT"
       "><"
       (:tag "A")
       (:attribute
        (:attributeName "HREF")
        "="
        (:attributeValue "\"https://news.ycombinator.com/\""))
       (:attribute
        (:attributeName "ADD_DATE")
        "="
        (:attributeValue "\"1481474004\""))
       (:attribute
        (:attributeName "ICON")
        "="
        (:attributeValue "\"\""))
       ">"
       (:content "Hacker" "News")
       "</"
       (:tag "A")
       ">")
      (:dt
       "<"
       "DT"
       "><"
       (:tag "A")
       (:attribute
        (:attributeName "HREF")
        "="
        (:attributeValue "\"http://example.com/\""))
       (:attribute
        (:attributeName "ADD_DATE")
        "="
        (:attributeValue "\"1481474004\""))
       ">"
       (:content "Example" "Domain")
       "</"
       (:tag "A")
       ">")
      "</"
      "DL"
      "><"
      "p"
      ">"))
    "</"
    "DL"
    "><"
    "p"
    ">"))
  "</"
  "DL"
  "><"
  "p"
  ">"))

No comments:

Post a Comment