REBOL [
   Title:   "Convert HTML to text"
   Author:  "Dick Whiting"
   Email:   dwhiting@europa.com
   Home:    http://www.europa.com/~dwhiting
   Date:    10-Jan-1999
   File:    %scripts/striphtml.r
   Version: 1.0.1

   Requirements: {

      Uses functions from dwstrings.rlib
      
   }

   Purpose: {

      Convert a string containing html code to straight text.

   }

   Changes: {

      Trivial bug fix.
   }

   Comment: {

      There are two blocks TAGS: and CODES: that you can add to if needed.
      TAGS: contains html tags that require special handling, such as inserting
      line breaks.

      CODES: contains some common html codes that get converted to regular
      Latin-1 characters. 

      The more you add, the longer the processing will take.

   }

   Functions: {

      striphtml -- single function to convert html to text
      
   }

   Uninstall: ["scripts" "striphtml.r"]

]

striphtml: func [str] [{

;**************************************************************************;
;*               A VERY basic function to convert html to text            *;
;**************************************************************************;
;*                                                                        *;
;*  Format:                                                               *;
;*                                                                        *;
;*   striphtml str           ; returns a string stripped of html code     *;
;*                                                                        *;
;*  Example usages:                                                       *;
;*                                                                        *;
;*   striphtml read %t:/welcome.html                     ; read from disk *;
;*   striphtml read http://www.rebol.com                 ; read from net  *;
;*                                                                        *;
;*  NOTE: this works well with my 'more' function 'more striphtml ...'    *;
;**************************************************************************;
}

   use [chk-space nlines spacechar dashes textstr tempstr htmlstr tags codes 
        tag tagidx blkhtml blkcnt bpos cr lf crlf] [

      chk-space: func [str] [
         if ((last str) <> newline) and ((last str) <> spacechar) [
            insert tail str " "
         ]
      ]

      nlines: func [num] [
         loop num [insert tail textstr newline]
      ]

      spacechar: make char! 32
      dashes: make string! 78
      dashes: copies "-" 77
      dashes: head insert tail dashes newline
      textstr: make string! str
      htmlstr: make string! (length? str)
      tempstr: make string! 0
      tag: make string! 0
      lf:  make char! 10
      cr:  make char! 13
      crlf: compress form [cr lf]

;**************************************************************************;
;*         Convert CR/LF to newlines, if not a Wxx Rebol version          *;
;**************************************************************************;

      if found? find str crlf [
         if (fourth rebol/version) <> 3 [
            textstr: replstr str crlf form newline
         ]
      ]            

;**************************************************************************;
;*             Common html tags requiring specialing handling             *;
;**************************************************************************;

      tags: [

         "<body"  [nlines 1]
         "<br"    [nlines 1]
         "<div"   [nlines 2]
         "<dl"    [nlines 1]
         "<dt"    [nlines 2]
         "<form"  [nlines 2]
         "<h1"    [nlines 2]
         "<h2"    [nlines 2]
         "<h3"    [nlines 2]
         "<h4"    [nlines 2]
         "<h5"    [nlines 2]
         "<h6"    [nlines 2]
         "<hr"    [insert tail textstr newline
                   insert tail textstr dashes]
         "<li"    [nlines 1]
         "<ol"    [nlines 2]
         "<p"     [nlines 2]

         "<pre"   [insert tail textstr copy/part htmlstr find htmlstr "</pre>"
                   htmlstr: remove/part htmlstr find htmlstr "</pre>"]

         "<table" [nlines 1]
         "<title" [nlines 1
                   insert tail textstr "Title: "]
         "<tr"    [nlines 2]
         "<ul"    [nlines 2]
      ]

;**************************************************************************;
;*                            Common html codes                           *;
;*            Add to this list as needed, the more; the slower            *;
;**************************************************************************;

      codes: [

         "&quot;" {"}
         "&nbsp;" " "
         "&amp;"  "&"
         "&lt;"   "<"
         "&gt;"   ">"
         "&copy;" ""
      ]

;**************************************************************************;
;*         Create one block with one string per line w/o newlines         *;
;**************************************************************************;

      blkhtml: scanstr textstr newline
      textstr: clear textstr
      textstr: head insert tail textstr newline

;**************************************************************************;
;*  Build a long string from the blocked html code. Add spaces as needed  *;
;*         Here's where to add BACK newlines for any <PRE> blocks         *;
;**************************************************************************;

      blkcnt: 1
      while [blkcnt <= (length? blkhtml)] [
         tempstr: pick blkhtml blkcnt
         if found? find tempstr "<pre>" [
            insert tail tempstr newline
            insert tail htmlstr trim/auto tempstr
            blkcnt: blkcnt + 1
            while [blkcnt < (length? blkhtml)] [
               if found? tag: find tempstr: pick blkhtml blkcnt "</pre>" [
                  tagidx: (index? tag) - 1
                  insert tail htmlstr copy/part tempstr tagidx
                  tempstr: remove/part tempstr tagidx
                  if (length? tempstr) = 6 [
                     insert tail htmlstr head tempstr
                     blkcnt: blkcnt + 1
                  ]
                  else [
                     remove/part skip blkhtml (blkcnt - 1) 1
                     insert skip blkhtml (blkcnt - 1) tempstr
                  ]
                  break
               ]
               else [
                  insert tail htmlstr tempstr
                  insert tail htmlstr newline
                  blkcnt: blkcnt + 1
               ]
            ]
         ]
         else [
            tempstr: trim/auto tempstr
            if (length? tempstr) > 0 [
               insert tail htmlstr tempstr
               chk-space htmlstr
            ]
            blkcnt: blkcnt + 1
         ]
      ]

      htmlstr: head htmlstr

;**************************************************************************;
;*                             Process string                             *;
;*       Copy straight text to output string, handle tag processing       *;
;**************************************************************************;

      while [found? tag: find htmlstr "<"] [
         
         tagidx: (index? tag) - 1
         tempstr: trim/auto copy/part head htmlstr tagidx

         if (length? tempstr) > 0 [
            chk-space textstr
            insert tail textstr tempstr
         ] 
         htmlstr: remove/part htmlstr tagidx

         tagidx: index? find htmlstr ">"
         tempstr: trim/auto copy/part htmlstr tagidx
         htmlstr: remove/part htmlstr tagidx

         if (found? tag: find tempstr " ") [tagidx: index? tag]
         else [tagidx: length? tempstr]

         tag: copy/part tempstr (tagidx - 1)
         if (copy/part tag 2) <> "</" [
            if (found? find tags tag) [
               do select tags tag
            ]
            else [
               if tag = "<img" [
                  if (found? find tempstr "alt=") [
                     tempstr: remove/part tempstr find tempstr "alt="
                     tempstr: remove/part tempstr (index? find tempstr {"})
                     tempstr: copy/part   tempstr find tempstr {"}
                     chk-space textstr
                     insert tail textstr "["
                     insert tail textstr tempstr
                     insert tail textstr "]"
                  ]
                  else [
                     chk-space textstr
                     insert tail textstr "[image]"
                  ]
               ]
            ]
         ]
      ]

      if (length? htmlstr) > 0 [insert tail textstr htmlstr]

;**************************************************************************;
;*      Replace common html codes with their actual Latin-1 character     *;
;**************************************************************************;

   foreach [code char] codes [
      textstr: replstr textstr code char
   ]

;**************************************************************************;
;*                 Replace numeric codes of &#xxx; format                 *;
;**************************************************************************;

   tagidx: 0

   while [found? tag: find skip textstr tagidx "&#"] [
      tagidx: index? tag
      tempstr: copy/part skip textstr (tagidx + 1) find textstr ";"
      if not none? cchar: make integer! tempstr [
         remove/part skip textstr (tagidx - 1) ((length? tempstr) + 3)
         insert skip textstr (tagidx - 1) make char! cchar
      ]
   ]

;**************************************************************************;
;*            Create a block of strings from the processed text           *;
;* Break strings longer than 80 into 80 character pieces, and build a     *;
;* final string to return.                                                *;
;**************************************************************************;

      if (length? head textstr) > 0 [
         blkhtml: scanstr textstr newline
         htmlstr: clear head htmlstr
         for n 1 (length? blkhtml) 1 [
            tempstr: pick blkhtml n
            while [(length? tempstr) > 79] [
               bpos: lastpos/before " " tempstr 80
               if bpos = 0 [bpos: 79]
               insert tail htmlstr copy/part tempstr bpos
               insert tail htmlstr newline
               tempstr: head remove/part tempstr bpos
            ]
            insert tail htmlstr tempstr
            insert tail htmlstr newline
         ] 
         return head htmlstr
      ]
      else [return none!]
   ]
]

""
