;;; -*- scheme -*-
;;;
;;; scmail.bayesian-filter - Bayesian spam filter experiment
;;;
;;;  Copyright(C) 2003 by Shiro Kawai (shiro@acm.org)
;;;
;;;  Permission to use, copy, modify, distribute this software and
;;;  accompanying documentation for any purpose is hereby granted,
;;;  provided that existing copyright notices are retained in all
;;;  copies and that this notice is included verbatim in all
;;;  distributions.
;;;  This software is provided as is, without express or implied
;;;  warranty.  In no circumstances the author(s) shall be liable
;;;  for any damages arising out of the use of this software.
;;;

(define-module scmail.bayesian-filter
  (use srfi-1)
  (use srfi-2)
  (use srfi-13)
  (use srfi-14)
  (use rfc.822)
  (use rfc.base64)
  (use rfc.quoted-printable)
  (use gauche.charconv)
  (use gauche.parameter)
  (use gauche.sequence)
  (use file.util)
  (use binary.pack)
  (use util.list)
  (use dbm)
  (use dbm.gdbm) ;; should be customizable
  (use scmail.config)
  (use scmail.mail)
  (use scmail.util)
  (export spamness-of-word
          spamness-of-mail
          mail-is-spam?

          with-token-table
          token-table-index-of-spam
          token-table-index-of-nonspam
          token-table-collect-words
          token-table-discard-words
          token-table-special-key-prefix 
          token-table-number-of-values
          token-table-languages
          token-table-for-each
          token-table-message-count 
          token-table-token-count
          token-table-cache-flush
          token-table-cache-length

          ;; for backward compatibility
          load-prob-tables
          convert-database
          )
  )
(select-module scmail.bayesian-filter)

;;==========================================================
;; Probability table
;;

(define-constant *dbm-class* <gdbm>) ;; should be customizable

(define (token-table-special-key-prefix) " %")

(define-constant *message-count-key* (string-append 
                                      (token-table-special-key-prefix)
                                      "message-count"))

(define (token-table-languages) '(#t ja))
(define (token-table-number-of-values)
  (* (length (token-table-languages)) 2))

(define-constant *value-packer* (make-string (token-table-number-of-values) #\V))

(define-class <token-table> ()
  ((db     :init-keyword :db)
   (rwmode :init-keyword :rwmode)
   (cache  :initform (make-hash-table 'string=?))
   (message-count :init-keyword :message-count)
   ))

(define (pack-value v)
  (pack *value-packer* (vector->list v) :to-string? #t))
(define (unpack-value s)
  (list->vector (unpack *value-packer* :from-string s)))

(define (open-token-table file rwmode)
  (define (get-values db key)
    (cond ((dbm-get db key #f) => unpack-value)
          (else (make-vector (token-table-number-of-values) 0))))
  (let* ((db     (dbm-open *dbm-class* :path file :rw-mode rwmode))
         (mcount (get-values db *message-count-key*)))
    (make <token-table> :db db :rwmode rwmode
          :message-count mcount)))

(define (minus->zero val)
  (max 0 val))

;; FIXME: Making a huge list just to count is too wild.        
(define (token-table-cache-length)
  (length (hash-table-keys (ref (token-table) 'cache))))

(define (token-table-cache-flush . proc)
  (let ((proc (get-optional proc (lambda (counter) #t)))
        (counter 1))
    (unless (and (memv (ref (token-table) 'rwmode) '(:write :create))
                 (not (dbm-closed? (ref (token-table) 'db))))
      (errorf "token table is not writable or opened"))
    (hash-table-for-each
     (ref (token-table) 'cache)
     (lambda (k v)
       (let* ((s (dbm-get (ref (token-table) 'db) k #f))
              (newv (map-to <vector> minus->zero
                            (if s
                              (map-to <vector> + v (unpack-value s))
                              v))))
         (if (every zero? (vector->list newv))
           (dbm-delete! (ref (token-table) 'db) k)
           (dbm-put! (ref (token-table) 'db) k (pack-value newv)))
         (proc counter)
         (inc! counter))))
    (slot-set! (token-table) 'cache (make-hash-table 'string=?))
    (for-each 
     (lambda (key slot)
       (dbm-put! (ref (token-table) 'db) key 
                 (pack-value (map-to <vector>
                                     minus->zero
                                     (ref (token-table) slot)))))
     (list *message-count-key*)
     (list 'message-count))))

(define (close-token-table tab)
  (dbm-close (ref tab 'db))
  (token-table #f))

(define token-table (make-parameter #f))

(define (with-token-table file rwmode thunk)
  ;; NB: this should really be an unwind-protect.
  ;; Waiting for its implementation...
  (dynamic-wind
   (lambda () (token-table (open-token-table file rwmode)))
   thunk
   (lambda () (close-token-table (token-table)))))

;; Open token table for reading.
;; We keep the name for backward compatibility.
(define (load-prob-tables file) #t)

(define (load-token-table-if-not-loaded)
  (let1 file (scmail-config-get-path 'token-table)
    (if (or (not (token-table)) (dbm-closed? (ref (token-table) 'db)))
      (let1 realfile (cond ((string-suffix? ".dbm" file) file)
                           ((file-exists? #`",|file|.dbm") #`",|file|.dbm")
                           ((string=? (string-complete->incomplete
                                       (with-input-from-file file
                                         (cut read-block 10)))
                                      #*";;-*-Schem")
                            (errorf "Database file ~s seems to have an old format.  Run scbayes --update-db" file))
                           (else file))
        (if (file-exists? realfile)
          (token-table (open-token-table realfile :read)))))))

(define (process-token type token sign . maybe-count)
  (let ((tab (token-table))
        (cnt (get-optional maybe-count 1)))
    (or (and-let* ((v (hash-table-get (ref tab 'cache) token #f)))
          (inc! (ref v type) (sign cnt)))
        (let1 v (make-vector (token-table-number-of-values) 0)
          (inc! (ref v type) (sign cnt))
          (hash-table-put! (ref tab 'cache) token v)
          ))
    ))

(define (add-token type token . maybe-count)
  (apply process-token type token + maybe-count))

(define (delete-token type token . maybe-count)
  (apply process-token type token - maybe-count))

(define (token-count type token)
  (let* ((tab (token-table))
         (v   (or (hash-table-get (ref tab 'cache) token #f)
                  (and-let* ((s (dbm-get (ref tab 'db) token #f))
                             (v (unpack-value s)))
                    (hash-table-put! (ref tab 'cache) token v)
                    v))))
    (if v (ref v type) 0)))

(define (message-count-of-type type)
  (ref (ref (token-table) 'message-count) type))

(set! (setter message-count-of-type)
      (lambda (type val)
        (set! (ref (ref (token-table) 'message-count) type) val)))

;; get appropriate table type index
(define (token-table-index-of-spam lang)
  (+ (* (or (find-index (cut eqv? <> lang) (token-table-languages)) 0) 2) 1))

(define (token-table-index-of-nonspam lang)
  (* (or (find-index (cut eqv? <> lang) (token-table-languages)) 0) 2))

;; for backward compatibility.
;; Convert old database to new database
(define (convert-database old-path new-path)
  (define current-table #f)
  (with-input-from-file old-path
    (lambda ()
      (with-token-table
       new-path :create
       (lambda ()
         (port-for-each
          (lambda (expr)
            (if (keyword? (car expr))
              (let1 tab ((if (get-keyword :spam? expr)
                           token-table-index-of-spam
                           token-table-index-of-nonspam)
                         (get-keyword :lang expr))
                (set! (message-count-of-type tab)
                      (get-keyword :message-count expr))
                (set! current-table tab))
              (add-token current-table (car expr) (cdr expr))))
          read)))))
  )

(define (token-table-token-count)
  (let1 result (make-vector (token-table-number-of-values) 0)
    (dbm-for-each
     (ref (token-table) 'db)
     (lambda (k v)
       (let1 v (unpack-value v)
         (unless 
             (string-prefix? (token-table-special-key-prefix) k)
           (dolist (lang (token-table-languages))
             (dolist (table (list token-table-index-of-nonspam
                                  token-table-index-of-spam))
               (if (> (ref v (table lang)) 0)
                 (inc! (ref result (table lang))))))))))
    result))

(define (token-table-message-count)
  (ref (token-table) 'message-count))

(define (token-table-for-each proc)
  (dbm-for-each
   (ref (token-table) 'db)
   (lambda (k v)
     (let1 v (unpack-value v)
       (proc k v)))))


;;==========================================================
;; Collecting statistics
;;

(define word-probability-limit 0.0001)

(define (process-words mail table-getter process-token sign)
  (receive (tokens lang) (tokenize-email mail)
    (unless (null? tokens)
      (let1 tab (table-getter lang)
        (for-each (cut process-token tab <>) tokens)
        (inc! (message-count-of-type tab) (sign 1))))))

(define (token-table-collect-words mail table-getter)
  (process-words mail table-getter add-token +))

(define (token-table-discard-words mail table-getter)
  (process-words mail table-getter delete-token -))



;; Calculate a word's spam probability.
;;  Cf. Paul Graham, "A Plan for Spam" and "Better Bayesian Filtering"
(define (spamness-of-word word lang)
  (load-token-table-if-not-loaded)
  (let* ((gtab (token-table-index-of-nonspam lang))
         (btab (token-table-index-of-spam lang))
         (g (token-count gtab word))
         (b (token-count btab word))
         (min-p word-probability-limit)
         (min-p2 (* min-p 2)))
    (cond ((< (+ (* g 2) b) 5)
           0.4) ; default 'spamness' for unfamiliar word
          ((zero? g)
           (if (> b 10) (- 1 min-p) (- 1 min-p2))) ;see "Better" paper
          ((zero? b)
           (if (> g 10) min-p min-p2)) ;see "Better" paper
          (else
           (clamp (/ (clamp (/ b (message-count-of-type btab)) #f 1.0)
                     (+ (clamp (/ (* g 2) (message-count-of-type gtab)) #f 1.0)
                        (clamp (/ b (message-count-of-type btab)) #f 1.0)))
                  min-p2
                  (- 1 min-p2)))
          )))

;; Given list of probabilities, calculates combined probability
;;  using Bayesian probability.
(define (spam-probability probs)
  (let1 prod (apply * probs)
    (/ prod (+ prod (apply * (map (cut - 1 <>) probs))))))

;; Take NUM most-interesting words from the list of tokens.
(define (most-interesting-words lang tokens num)
  (let ((tab (make-hash-table 'string=?))
        (extremeness (lambda (prob) (abs (- 0.5 prob)))))
    (for-each (lambda (tok)
                (unless (hash-table-get tab tok #f)
                  (hash-table-put! tab tok (spamness-of-word tok lang))))
              tokens)
    (let ((wlist (sort (hash-table-map tab cons)
                       (lambda (a b) (> (extremeness (cdr a))
                                        (extremeness (cdr b)))))))
      (take* wlist num))))

;; Calculate probability of the mail being spam.  Returns three values:
;; the probability, the language of the mail, and the list of 15
;; significant words with each "spamness".
(define (spamness-of-mail mail)
  (load-token-table-if-not-loaded)
  (receive (tokens lang) (tokenize-email mail)
    (let1 words (most-interesting-words lang tokens 15)
      (values (spam-probability (map cdr words)) lang words))))

;; Predicate API
(define (mail-is-spam? mail)
  (load-token-table-if-not-loaded)
  (and (token-table)
       (receive (prob lang words)
           (if (string? mail) ; Accept file name for backward compatibility
             (spamness-of-mail
              (make <mail> :file mail))
             (spamness-of-mail mail))
         (>= prob 0.9))))

;;============================================================
;; Tokenizing message
;;

;; Entry point.  Reads FILE and returns two values; the list of
;; tokens and the language of the message.
;; First it tries to honor the charset in the message body
;; (if no charset is given, try to use "*ja" guessing), and if 
;; it fails, re-try with us-ascii.
;; (Because of retry, we need to accumulate tokens instead of
;; processing them as we read FILE.)
(define (tokenize-email mail)

  (define (try charset)
    (let* ((tokens '())
           (lang (with-input-from-string (ref mail 'content)
                   (cut with-port-locking (current-input-port)
                        (cut tokenize-message (scmail-mail-query mail 'file) 
                             charset
                             (lambda (token) (push! tokens token))))))
           )
      (values (reverse! tokens) lang)))

  (with-error-handler
      (lambda (e)
        (scmail-wformat "error during processing ~a (~a)" 
                        (scmail-mail-query mail 'file) (ref e 'message))
        (values '() #f))
    (lambda ()
      (with-error-handler
          (lambda (e) (try "none"))
        (lambda () (try #f)))))
  )

;; Tokenize one chunk of message (header + body).  If the body is
;; MIME, call tokenize-mime, which in turn calls tokenise-message
;; recursively to deal with each part.  Returns a language tag
;; (for now, either #t or 'ja).
(define (tokenize-message file charset receiver)
  (let* ((headers (rfc822-header->list (current-input-port)))
         (content-type (get-content-type headers))
         (boundary (get-mime-boundary content-type))
         (charset (if (equal? charset "none")
                    "none"
                    (get-charset content-type charset)))
         )
    (tokenize-header headers charset receiver)
    (if boundary
      (tokenize-mime file receiver charset boundary)
      (tokenize-body file receiver charset content-type
                     (get-transfer-encoding headers)))
    ))

;; Tokenize a message header.  Returns void.
;;   NB: we should use tokenize-port-iso8859 in case if the header
;;   contains invalid multibyte character.  Right now, this code fails
;;   when, for example, the mail has some invalid charset name and the
;;   header field contains iso8859-1 character.  However, such an error
;;   is caught by tokenize-email and it rescans the mail by setting
;;   charset to "us-ascii" forcibly, so such an email is correctly
;;   parsed using tokenize-port-iso8859 eventually.
(define (tokenize-header headers charset receiver)
  (for-each (lambda (header)
              (with-input-from-string
                  (scmail-mail-decode-field 
                   (string-join (map x->string (cdr header)) " ")
                   (x->string (gauche-character-encoding)))
                (cute (if (or (equal? "none" charset)
                              (string-ci=? "us-ascii" charset)
                              (string-prefix-ci? "iso-8859" charset))
                        tokenize-port-iso8859
                        tokenize-port)
                      receiver)))
            headers))

;; Tokenize a message body.
;; Returns a language tag, or #f.
(define (tokenize-body file receiver charset 
                       content-type transfer-encoding)
  (define (do-tokenize)
    (cond
     ;; Lots of emails that claims charset=us-ascii actually contains
     ;; iso-8859 characters.
     ((or (equal? "none" charset)
          (string-ci=? "us-ascii" charset)
          (string-prefix-ci? "iso-8859" charset))
      (tokenize-port-iso8859 receiver))
     ;; Deal with multibyte messages.
     ((or (equal? charset "*ja")
          (ces-conversion-supported? charset
                                     (x->string (gauche-character-encoding))))
      (with-input-from-port (open-input-conversion-port
                             (current-input-port) charset)
        (cut tokenize-port receiver)))
     (else
      (scmail-dformat "~a: unknown charset (~a): applying single-byte analysis"
                      file charset)
      (tokenize-port-iso8859 receiver))
     ))
  
  (cond
   ((and content-type
         (not (or (string-prefix-ci? "text/" content-type)
                  (string-prefix-ci? "message/rfc822" content-type))))
    (scmail-dformat "skipping ~a (content-type=~a)" file content-type)
    #f)
   ((and transfer-encoding (string-ci=? transfer-encoding "base64"))
    (with-input-from-string (base64-decode-string (port->string (current-input-port)))
      do-tokenize))
   ((and transfer-encoding (string-ci=? transfer-encoding "quoted-printable"))
    (with-input-from-string (quoted-printable-decode-string (port->string (current-input-port)))
      do-tokenize))
   (else (do-tokenize))
   ))

;; Tokenize MIME message.  Calls tokenize-message for each part.
;; Returns a language tag, or #f.
(define (tokenize-mime file receiver charset boundary)
  ;; skip the toplevel part
  (define (skip-preamble line)
    (cond ((eof-object? line) #f)
          ((equal? line boundary) #t)
          (else (skip-preamble (read-byte-line)))))
  ;; collect parts
  (define (collect-parts parts)
    (let loop ((parts '()))
      (let loop2 ((line (read-byte-line))
                  (lines '()))
        (cond ((eof-object? line)
               (cons (string-join (reverse! lines) "\r\n") parts))
              ((equal? (string-complete->incomplete line)
                       (string-complete->incomplete boundary))
               (loop (cons (string-join (reverse! lines) "\r\n") parts)))
              (else
               (loop2 (read-byte-line) (cons line lines)))))))
  ;; language
  (define lang #t)
  ;; main body
  (and (skip-preamble (read-byte-line))
       (fold (lambda (part number)
               (with-input-from-string part
                 (lambda ()
                   (let1 lg
                       (tokenize-message #`",|file|#,|number|"
                                         charset receiver)
                     (when (symbol? lg) (set! lang lg)))))
               (+ number 1))
             0
             (reverse! (collect-parts '())))
       lang)
  )

;; Tokenizer utilities

(define (flush-token lis receiver)
  (unless (or (null? lis) (null? (cdr lis))
              (every (cut char-set-contains? #[\d_-] <>) lis))
    (receiver (list->string (map char-downcase (reverse! lis))))))

;; japanese delimiting characters
;;   these are used as hints for tokenizer. 
(define char-set:ja-delimiters
  (char-set #\u3002  ;; #\。
            #\u3001  ;; #\、
            #\uff0c  ;; #\，
            #\u30fb  ;; #\・
            #\u300c  ;; #\「
            #\u300d  ;; #\」
            #\u300e  ;; #\『
            #\u300f  ;; #\』
            ))

(define char-set:ja-kanji
  (case (gauche-character-encoding)
    ((utf-8)
     (ucs-range->char-set #x4e00 #x9fef)) ;; CJK unified ideograms
    ((euc-jp sjis)
     (integer-range->char-set (char->integer #\u4e9c) ;; #\亜
                              (char->integer #\u7464) ;; #\瑤
                              ))
    (else #[])))

;; Base tokenizer for (potentially) Japanese message.
(define (tokenize-port receiver)
  (define alnum-count 0)
  (define mb-count 0)
  
  (define (alnum? ch) (char-set-contains? #[-$'!\w] ch))
  (define (mb? ch) (>= (char->integer ch) 256))
  (define (kanji? ch) (char-set-contains? char-set:ja-kanji ch))

  (define (ascii ch acc)
    (inc! alnum-count)
    (cond ((eof-object? ch) (flush-token acc receiver))
          ((mb? ch)
           (flush-token acc receiver)
           (multibyte (read-char) ch))
          ((alnum? ch) (ascii (read-char) (cons ch acc)))
          (else 
           (flush-token acc receiver)
           (blank (read-char)))))
  (define (blank ch)
    (cond ((eof-object? ch))
          ((mb? ch) (multibyte (read-char) ch))
          ((alnum? ch) (ascii (read-char) (list ch)))
          (else (blank (read-char)))))
  (define (multibyte ch prev)
    (inc! mb-count)
    (cond ((eof-object? ch))
          ((or (eqv? ch #\return) (eqv? ch #\newline))
           (multibyte (read-char) prev)) ;; ignore newlines between mb char
          ((mb? ch)
           (cond ((char-set-contains? char-set:ja-delimiters ch)
                  (multibyte (read-char) #f))
                 (prev
                  (unless (and (not (kanji? prev))
                               (kanji? ch))
                    (flush-token (list ch prev) receiver))
                  (multibyte (read-char) ch))
                 (else
                  (multibyte (read-char) ch))))
          ((alnum? ch) (ascii (read-char) (list ch)))
          (else (blank (read-char)))))
  (ascii (read-char) '())
  ;; We use heuristics to determine if the message is Japanese or not.
  ;; If more than 20% of its characters are multibyte, we assume it's a
  ;; japanese message.
  (if (> (* mb-count 4) alnum-count) 'ja #t))

;; Base tokenizer for ASCII message.
(define (tokenize-port-iso8859 receiver)
  (define (alnum? b) (char-set-contains? #[-$'!\w] (integer->char b)))
  (define (ascii b acc)
    (cond ((eof-object? b) (flush-token acc receiver))
          ((alnum? b) (ascii (read-byte) (cons (integer->char b) acc)))
          (else (flush-token acc receiver)
                (blank (read-byte)))))
  (define (blank b)
    (cond ((eof-object? b))
          ((alnum? b) (ascii (read-byte) (list (integer->char b))))
          (else (blank (read-byte)))))
  (ascii (read-byte) '())
  #t)

;; helper function to get a line as a byte string
(define (read-byte-line . maybe-port)
  (read-line (get-optional maybe-port (current-input-port)) #t))

(define (get-content-type headers)
  (rfc822-header-ref headers "content-type"))

(define (get-transfer-encoding headers)
  (rfc822-header-ref headers "content-transfer-encoding"))

(define (get-mime-boundary content-type)
  (cond ((and content-type (not (string-incomplete? content-type))
              (#/multipart\/\w+\;.*boundary=\s*\"?([-+'(),.\/:=?\w ]+)\"?/i
               content-type))
         => (lambda (m) (string-append "--" (m 1))))
        (else #f)))

(define (get-charset content-type default)
  (cond ((and content-type (not (string-incomplete? content-type))
              (#/charset=\"?([-_\w.]+)\"?/ content-type))
         => (cut <> 1))
        (default)
        (else "*ja"))) ;;guess
   
(provide "bayesian-filter")
