/* Copyright 2010 Aaron J. Radke Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package cc.drx object Fuzzy{ implicit object ParsableFuzzy extends Parsable[Fuzzy]{ def apply(pat:String):Fuzzy = Fuzzy(pat) } } // TODO implement and remove fuzzy specific logic from the collection additions case class Fuzzy(pat:String){ // def compare(a:String, b:String) = TODO // collection operators private def defaultAsString[A](x:A):String = x.toString /**filter list where some match exists*/ def filter[A](xs:Seq[A]):Seq[A] = filter(xs, defaultAsString) //show only matches def filter[A](xs:Seq[A],asString:A=>String):Seq[A] = xs.filter{x => penalty(asString(x)).isDefined} //show only matches /**sort list from best to worst match (lowest penalty)*/ // def sort[A](xs:Seq[A]):Seq[A] = sort(xs, defaultAsString) def sort[A](xs:Seq[A],asString:A=>String):Seq[A] = zipWithPenalty(xs,asString).sortBy(_._2).map(_._1) //sort the matches /**find the best match out of the bunch (lowest penalty) */ // def find[A](xs:Seq[A]):Option[A] = zipWithPenalty(xs).getNonEmpty.map{_.minBy(_._2)._1} //get the best/first match if found def find[A](xs:Seq[A]):Option[A] = find(xs, defaultAsString) def find[A](xs:Seq[A], asString:A=>String):Option[A] = zipWithPenalty(xs,asString).getNonEmpty.map{_.minBy(_._2)._1} //get the best/first match if found private def zipWithPenalty[A](xs:Seq[A],asString:A=>String):Seq[(A,Int)] = for(x <- xs; z <- penalty(asString(x))) yield (x, z) /** integer penalty, where a zero is a perfect match, None means no match at all*/ private def penalty(str:String):Option[Int] = { //think about a more generic interface than an integer penalty that may change //TODO move the pat dependent calcs as private lazy val's local the Fuzzy case class to cache calcs for future penalty lazy val lStr = str.toLowerCase lazy val lPat = pat.toLowerCase lazy val pats = Parse.split[String](pat) lazy val lPats = pats.map{_.toLowerCase} lazy val terms = pat.splitTerms lazy val lTerms = terms.map{_.toLowerCase} lazy val soundexStr = Soundex(str) lazy val soundexTerms = terms map Soundex.apply lazy val beacon:String = str.toList.sliding(2,1).filter{ case List(a,b) => (a.isLower && b.isUpper) || (!a.isLetter && b.isLetter) || b.isDigit case _ => true }.flatMap(_.lastOption).mkString(str.take(1),"","") //instead of headOption this works: ("".take(1) == "") lazy val lBeacon = beacon.toLowerCase // Log(str, beacon) lazy val N = str.size lazy val n = pat.size lazy val Nb = beacon.size lazy val np = pats.map{_.size}.sum lazy val deltaNPenalty:Int = (N-n).abs.sat(0,99) //0 same size, 99 lots of extra chars penalty step size // val useCase = pat exists (_.isUpperCase) //vim smartcase like auto mode setting // note: case matching is baked in with priority penalty //-- cost function search //smaller numbers are a better match, zero is perfect, (prime numbers and relative size matching confuse separation, just use simple priority based thinking), this prevents the need to penalty by all and use best //--empty string val res = { if(str == "") Int.MaxValue //--full match case else if(str == pat) 0 //zero is perfect match //--full match case insensitive else if(lStr == lPat) 100 //--partial match case else if(str contains pat) 200 //TODO differentiate degree of partial match //--partial match case insensitive else if(lStr contains lPat) 300 //--match all list of pats Case else if(pats forall (str contains _)) 400 //--match all list of pats NoCase else if(lPats forall (lStr contains _)) 500 //--match all list of terms Case else if(terms forall (str contains _)) 600 //--match all list of terms NoCase else if(lTerms forall (lStr contains _)) 700 //--match beacon with Case else if(beacon contains pat) 800 //--match beacon NoCase else if(lBeacon contains lPat) 900 //--match beacon elements subsequences else if(beacon containsSubsequence pat) 1000 //--match beacon elements subsequences else if(lBeacon containsSubsequence lPat) 1100 //--match with subsequence else if(str containsSubsequence pat) 1200 //--match with subsequence NoCase else if(lStr containsSubsequence lPat) 1300 //--match all list of terms by simularity //--match all terms by a soundex simularity else if(soundexTerms forall (soundexStr contains _)) 1400 //--match with spaces //--TODO allow skips in beacon matches //--TODO allow beacon like matches as part of the pattern lists else Int.MaxValue } if(res == Int.MaxValue) None else Some(res + deltaNPenalty) } }