;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Decision Tree learning system. ;;; Written by Shaul Markovitch, CS department, Technion. ;;; ;;; To try the system, perform: ;;; (load-data "examples-data-file") ;;; (learning-experiment) ;;; Where "examples-data-file" should be a name of a data file that ;;; contains classified examples, each example on a separate line. ;;; An example is a list of attribute values seperated by a comma or ;;; a by a space. ;;; The class is the last item in the line. (But can be the first by ;;; setting a parameter). ;;; The result is the average accuracy obtained by performing ;;; K-fold cross-validation learning experiment. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Change-log ;;; ;;; Version 0.03 [31/10/99] ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; class-tree ;;; A structure that holds information of a decision tree node ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defstruct class-tree test ;A test for being member in this subtree attribute ;Attribute used for splitting children default ;Default classification for members of the ;subtree children ;Subtrees of the node (label "=") ;Used for drawing trees ) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; attribute ;;; A structure that holds information about an attribute ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defstruct attribute name ;A name for the attribute index ;The attribute index. Used for retrieving ; its value in an example. type ; Nominal or numeric domain ; The list of all values observed for the ; attribute. (unknowns 0) ; The number of examples having unknown value ; for the attribute min ; The minimal value observed for the attribute ; (for numeric attribute only) max ; Maximal value for the attribute. default ; Default value for the attribute. ) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; The following three variables will be assigned by the procedure that ;;; reads data files. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defvar *examples* nil "a list of examples. Each example is a pair: a list of values and a class") (defvar *attributes* nil "a list of attributes. Each attribute is a structure.") (defvar *classes* '(n p) "a list of classes. The default is p and n.") (defvar *trace* nil "When non-nil, the learning algorithm will report progress") ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defparameter *minimum-gain* 0.0001 "The tree building will not progress when gain is below.") ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ID3 ;;; The basic classification algorithm. Gets examples and returns a ;;; tree. No windows. Test is used only for recursive calls - it ;;; contains the value that was used for getting the subtree. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun id3 (examples &optional (attributes *attributes*) (test nil)(classes *classes*)) (cond ((null examples) nil) (t (let* ((class-count (count-items examples :key #'example-classification :values classes))) ;; Class-count returns a list of the number of ;; appearances of each class (can work for more than two ;; classes). Each element is a pair - the name of the ;; class and the number of appearances. (cond ((zerop (second (second class-count))) ;one class (make-class-tree :default (first (first class-count)) :test test)) (t (create-subtree examples attributes classes (most-frequent class-count) test))))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; most-frequent ;;; Takes a class count (a list of pairs ) ;;; Returns the class with most appearances (selects randomly if several) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun most-frequent (count) (let ((most-frequent (loop for c in count while (= (second c) (second (first count))) collect (first c)))) (random-selection most-frequent))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; create-subtree ;;; The main learning procedure. Computes the gain for each ;;; attribute. Creates a subtree for the attribute with maximal gain. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun create-subtree (examples attributes classes default test) (let ((max-gain most-negative-fixnum) new-gain new-examples max-att max-examples children max-atts max-examples-s) (when *trace* (format t "~%--------------------------------") (format t "~%N-Examples: ~A. Choosing from : ~A ~%" (length examples) (mapcar #'attribute-name attributes))) (loop for a in attributes do ;; The gain function returns the gain as the major value. ;; It also returns a second value: a list of pairs, each ;; pair consists of the value used for a child and the ;; examples that will be moved down to that child. (multiple-value-setq (new-gain new-examples)(gain a examples classes)) (when *trace* (format t "~% Attribute: ~A. Gain: ~A" (attribute-name a) new-gain)) (cond ((> new-gain max-gain) (setq max-gain new-gain max-atts (list a) max-examples-s (list new-examples))) ;; When several attributes have equal maximal gain, we ;; would like to select from them randomly. ((= new-gain max-gain) (push a max-atts) (push new-examples max-examples-s)))) ;;The condition below allow us to stop processing when too little ;;gain is achieved. (when (< max-gain *minimum-gain*)(return-from create-subtree (make-class-tree :default default :test test))) (let ((n (random (length max-atts)))) (setq max-att (elt max-atts n) max-examples (elt max-examples-s n))) (when *trace* (format t "~%Selected attribute: ~A~%" (attribute-name max-att))) ;; Create a subtree for each child that has at least one example (setf children (loop for example-set in max-examples when (second example-set) collect (id3 (second example-set) (if (eql (attribute-type max-att) 'numeric) attributes (remove max-att attributes)) (first example-set)))) ;; The label is used only for drawing trees. (when (eql (attribute-type max-att) 'numeric) (setf (class-tree-label (first children)) "<=") (setf (class-tree-label (second children)) ">")) (make-class-tree :default default :test test :attribute max-att :children children ))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; gain ;;; Compute the gain for an attribute. Computes the information ;;; (uncertainty) for the current node and the average uncertainty of ;;; the children and subtract them. Has different handling of numeric ;;; and nominal attributes. Returns two values. The first is the gain ;;; itself. The second is the partition of the examples to the ;;; children. The partition is a list of pair. Each pair consisting ;;; of the value used for the child and the list of examples belong to ;;; the child. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun gain (att examples classes) (let ((current-info (IPN (mapcar #'second (count-items examples :key #'example-classification :values classes))))) (multiple-value-bind (children-info children-examples) (if (eql (attribute-type att) 'numeric) (numeric-uncertainty att examples classes) (nominal-uncertainty att examples classes)) (values (- current-info children-info) children-examples)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; nominal-uncertainty ;;; This is the simple uncertainty calculation for attribute with ;;; nominal values. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun nominal-uncertainty (att examples classes) (let ((partition (collect-items examples (attribute-domain att) :key #'(lambda (e) (get-value (example-values e) att)))) (n (length examples))) (values (loop for p in partition for info = (compute-info (second p) classes) sum (* (/ (length (second p)) n) info)) partition))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;compute-info ;;;Computes the uncertainty of a set of examples. counts how many ;;;members of each class are in the set, and computes the information. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun compute-info (examples classes) (let ((count (count-items examples :key #'example-classification :values classes))) (IPN (mapcar #'second count)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; numeric-uncertainty ;;; Computes the lowest uncertainty of two children created by a ;;; binary numerical test. It sorts the examples and maintains two ;;; lists. Its keeps on moving examples from the right list to the ;;; left list. For each partition it computes the uncertainty. It ;;; returns the value that obtains the lowest uncertainty (and the ;;; partition associated with that value). ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun numeric-uncertainty (att examples classes) (let* ((sorted-examples (sort (copy-list examples) #'< :key #'(lambda (e)(get-value (example-values e) att)))) (left-count (count-items nil :key #'example-classification :values classes)) (right-count (count-items examples :key #'example-classification :values classes)) (min-uncertainty (IPN (mapcar #'second right-count))) (best-left examples)(best-right nil)) (loop with left = nil with left-n = 0 with right = sorted-examples with right-n = (length right) with n = right-n while right for c = (example-classification (first right)) do (push (pop right) left) (decf (second (assoc c right-count))) (incf (second (assoc c left-count))) (decf right-n) (incf left-n) (when (and left right (not (= (get-value (example-values (first left)) att) (get-value (example-values (first right)) att)))) (let ((child-uncertainty (+ (* (/ left-n n)(IPN (mapcar #'second left-count))) (* (/ right-n n)(IPN (mapcar #'second right-count)))))) (when (< child-uncertainty min-uncertainty) (setq min-uncertainty child-uncertainty best-left left best-right right))))) (let ((sep (if (and best-left best-right) (* 0.5 (+ (get-value (example-values (first best-left)) att) (get-value (example-values (first best-right)) att))) (get-value (example-values (first examples)) att) ))) (values min-uncertainty (list (list sep best-left)(list sep best-right)))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; IPN ;;; Sigma [- (* P log P)] ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun IPN (numbers) (let ((total (apply #'+ numbers))) (cond ((zerop total) 0) (t (loop for n in numbers sum (- (* (/ n total)(my-log (/ n total) 2)))))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; my-log ;;; Modify log definition : log(0)=0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun my-log (r b) (if (zerop r) 0 (log r b))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ID3-CLASSIFY ;;; The classification function. Takes a tree and an example and ;;; moves the example down the tree. Returns the class. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun id3-classify (example tree) (let ((children (class-tree-children tree))) (cond ((null children)(class-tree-default tree)) (t (let* ((value (get-value example (class-tree-attribute tree)))) (cond ((eql (attribute-type (class-tree-attribute tree)) 'numeric) (id3-classify example (if (<= value (class-tree-test (first children))) (first children)(second children)))) (t (let ((child (find value children :key #'class-tree-test :test #'equalp))) (if child (id3-classify example child) (class-tree-default tree)))))))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Access and utility functions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun example-values (example) (car example)) (defun example-classification (example) (second example)) (defun get-value (example attribute) (let ((val (elt example (1- (attribute-index attribute))))) (if (eql val '?) (attribute-default attribute) val))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; count-items ;;; Returns a count of items in a list. For example, the list (a b c ;;; a b c b) will yield the output ((b 3)(a 2)(c 2)). The "key" ;;; parameter can be used to access a specific part of the elements. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun count-items (list &key (key #'identity) (values nil)) (let (counting-list) (when values (setq counting-list (loop for v in values collect (list v 0)))) (loop for item in list for v = (funcall key item) for entry = (assoc v counting-list) do (cond ((null entry) (when (null values) (push (list v 1) counting-list))) (t (incf (second entry))))) (sort counting-list #'> :key #'second))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; collect-items ;;; Like count-item but collect the actual items. ;;; (collect-items '((2 b)(3 a)(2 a)(7 b)) '(a b c d) :key #'second) ;;; will return: ((a ((2 a)(3 a)))(b ((2 b)(7 b)))(c nil)(d nil)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun collect-items (list values &key (key #'identity) ) (let ((collecting-list (mapcar #'(lambda (v)(list v nil)) values))) (loop for el in list for v = (funcall key el) for entry = (assoc v collecting-list) when entry do (push el (second entry))) collecting-list)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; number-of-leaves ;;; Returns the number of leaves in a tree ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun number-of-leaves (tree) (cond ((null (class-tree-children tree)) 1) (t (loop for child in (class-tree-children tree) sum (number-of-leaves child))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Experimentation functions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defparameter *classification-function* #'id3-classify) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; test-classifier ;;; Gets a test set and a classifier (a tree) and returns a number ;;; between 0 and 1 - the accuracy of the classifier on the test set. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun test-classifier (test-set classifier &optional (classification-function *classification-function*)) (float (* 100 (/ (loop for e in test-set for i from 1 count (equalp (example-classification e) (if classifier (funcall classification-function (example-values e) classifier) (random-selection *classes*)))) (length test-set))))) (defun noisy-examples (examples) (loop for tail on examples for alt = (assoc (first (first tail))(rest tail) :test #'equal) when (and alt (not (equal (second (first tail))(second alt)))) collect (list (first tail) alt))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; classification-experiment ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defparameter *learning-algorithm* #'id3) (defparameter *n-fold* 10) (defun learning-experiment (&key (examples *examples*) (learning-algorithm *learning-algorithm*)) (setf *random-state* (make-random-state t)) (let ((partition (random-partition examples *n-fold*))) (average (loop for testing-set in partition for training-set = (apply #'append (remove testing-set partition)) collect (test-classifier testing-set (funcall learning-algorithm training-set)))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Input handling ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; load-data ;;; Assumes a data file where each example occupies one line. The ;;; values are separated by commas or by spaces. The class is the ;;; last (default) or first item. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun load-data (file-name &optional (class-first? nil)) (let ((raw-examples (read-data-file file-name))) (setf *examples* (create-examples raw-examples class-first?)) (setf *classes* (collect-classes *examples*)) (setf *attributes* (analyze-attributes *examples*)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; store-data-in-lisp-file ;;; This function writes the 3 variables *classes*, *attributes* and ;;; *examples* into a lisp file in a loadable form. The file can then ;;; be loaded using the regular "load" function. It has two purposes: ;;; 1. Loading will be much faster. This is important for large data ;;; files. ;;; 2. The user can use the lisp file for changing names of attributes ;;; (since the automatic data loader give generated names for the ;;; attributes. ;;; The function should be called AFTER the data was loaded and ;;; analyzed using the function load-data. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun store-data-in-lisp-file (file-name) (with-open-file (f file-name :if-exists :supersede :if-does-not-exist :create :direction :output) (format f "~%~:W~% ~%~:W~% ~%~:W~%" `(setq *classes* ',*classes*) `(setq *attributes* ',*attributes*) `(setq *examples* ',*examples*)) f)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; read-data-file ;;; reads a data file to a list of lists. Each line is an a list. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun read-data-file (file-name) (set-syntax-from-char #\, #\space) (let* ((data (with-open-file (f file-name :direction :input) (let (token) (loop while (setq token (read-line f nil nil)) collect token)))) (examples (loop for line in data for example = (with-input-from-string (string line) (loop for item = (read string nil nil) while item collect item)) when example collect example))) (set-syntax-from-char #\, #\,) examples)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; create-examples ;;; Turns the list into a list of examples. The class is by default ;;; the last element in the list, but can also be specified as the ;;; first. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defparameter *filter-uknown* nil) (defun create-examples (input-lists &optional (class-first? nil)) (loop for e in input-lists for example = (if class-first? (list (rest e)(first e)) (list (butlast e) (first (last e)))) when (or (null *filter-uknown*) (loop for v in (example-values example) never (eql v '?))) collect example)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; collect-classes ;;; Takes a list of examples and collects a list of all the classes ;;; appearing in the list. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun collect-classes (examples) (let (classes) (loop for e in examples do (pushnew (example-classification e) classes)) classes)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; analyze-attributes ;;; The major data analysis tool. Takes a list of example, and ;;; analyzes each of the attributes. The most important decision is ;;; whether an attribute is to be considered numeric or nominal. It ;;; will be considered numeric if two conditions are satisfied: ;;; 1. The values are numbers. ;;; 2. There are at least *min-values-in-numeric* different values. ;;; The function also computes default values for the attributes. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defparameter *min-values-in-numeric* 10 "The minimum domain size to be considered as numeric.") (defun analyze-attributes (examples) (let ( (attributes (loop repeat (length (example-values (first examples))) collect (make-attribute)))) (loop for e in examples do (loop for v in (example-values e) for a in attributes for i from 0 do (cond ((eql v '?)(incf (attribute-unknowns a))) (t (pushnew v (attribute-domain a)))))) (loop for a in attributes for i from 1 do (setf (attribute-name a)(format nil "att~d" i)) (setf (attribute-index a) i) (cond ((and (numberp (first (attribute-domain a))) (>= (length (attribute-domain a)) *min-values-in-numeric*)) (setf (attribute-domain a) (sort (attribute-domain a) #'<)) (setf (attribute-type a) 'numeric) (setf (attribute-min a)(first (attribute-domain a))) (setf (attribute-max a)(first (last (attribute-domain a)))) (setf (attribute-default a)(average (attribute-domain a)))) (t (setf (attribute-type a) 'nominal) (setf (attribute-default a) (most-frequent (count-items examples :values (attribute-domain a) :key #'(lambda (e) (get-value (example-values e) a) ))))))) attributes )) #| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Tree display functions ;;; ;;; The following functions are used in combination with the PSGRAPH ;;; package to create a graphical representation of the decision tree ;;; and either view it on the screen using postscript viewer or send ;;; it to a postscript printer ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (when (null (find-package 'psgraph))(load "psgraph")) (defun psgraph-children-func (class-node)(class-tree-children class-node)) (defun psgraph-info-func (class-node) (let ((test-label (if (class-tree-test class-node) (format nil "~a ~a" (class-tree-label class-node) (class-tree-test class-node)) ""))) (list (format nil "~a [~a] ~a" test-label (class-tree-default class-node) (if (class-tree-attribute class-node) (attribute-name (class-tree-attribute class-node)) ""))))) (defun draw-tree (tree &optional (shrink t)) (with-open-file (*standard-output* "tree.ps" :direction :output :if-exists :supersede) (psgraph:psgraph tree #'psgraph-children-func #'psgraph-info-func shrink nil #'eq))) (defun print-tree (tree) (draw-tree tree) (run-program "lpr" "tree.ps")) (defun view-tree (tree) (draw-tree tree) (run-program "ghostview" "tree.ps")) |# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; random-partition ;;; ;;; Takes a set and a number n and partitions the set to n [disjoint] ;;; partions. If the resulting sets will be too small, then a smaller ;;; number will be used. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defparameter *minimal-portion-size* 30) (defun random-partition (data-set n &optional (minimal-size *minimal-portion-size*)) (let* ( (set-size (length data-set)) (n-subsets (min n (truncate (/ set-size minimal-size)))) (min-subset-size (floor (/ set-size n-subsets))) (subsets (make-list n-subsets :initial-element nil)) (full-subsets nil)) (loop while subsets for el = (pop data-set) for index = (random (length subsets)) for subset = (elt subsets index) for new-subset = (cons el subset) do (cond ((>= (length new-subset) min-subset-size) (setf subsets (delete subset subsets :start index :end (1+ index))) (push new-subset full-subsets)) (t (setf (elt subsets index) new-subset))) ) (loop for d in data-set for index from 0 do (push d (elt full-subsets index))) full-subsets)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; random-subset ;;; ;;; Gets a set of elements and a second argument that can be an ;;; integer greater or equal to one or a float between 0 and 1. If it ;;; is 1 or larger, then it is the size of the random subset. If it ;;; it smaller than 1 then it is the portion of the set that will be ;;; the random subset. The function returns the random subset. As a ;;; second value it returns the difference set (i.e. set minus the ;;; subset) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun random-subset (set portion) (let* ((set-copy (copy-list set)) (subset-size (if (>= portion 1) (truncate portion) (truncate (* (length set) portion)))) (new-set (loop repeat subset-size for random-index = (random (length set-copy)) for element = (elt set-copy random-index) do (setq set-copy (delete element set-copy)) collect element))) (values new-set set-copy))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; random-selection ;;; ;;; returns a random element of a set ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun random-selection (set)(elt set (random (length set)))) (defun average (s &key (key #'identity)) (let* ((n (length s)) (mean (float (/ (loop for i in s sum (funcall key i))(length s)))) (var (float (/ (loop for i in s sum (expt (- (funcall key i) mean) 2)) n))) (std (float (sqrt var)))) (values mean std))) ;var is rarely needed