#!/usr/bin/ruby1.8 require 'GoogleSearchCache' require 'Config' include GoogleSearch config = CompLearn::readConfig() @g = GoogleSearchCache.new(config) def searchPair(term1, term2) @g.getPageCount([term1, term2]) end def searchSingle(term) @g.getPageCount([term]) end def computeLogProb(num) Math.log(num.to_f / 8000000000.0) / Math.log(2.0) end def doFirstGraph mainword = "green" maxnum = 400 todo = [ ] maxnum.times { |i| todo << i } [ 813, 751, 1853, 2831, 5123, 7188, 11851, 23457, 45474, 68157, 193813, 245138, 356852, 621841, 1895867, 485763].each { |i| todo << i } [1000,2000,4000,8000,16000,32000,64000,128000,256000,512000,1000000,2000000,5000000,250000,125000,10000, 100000, 9999,99999,999999 ].each { |i| todo << i-1 } todo.each { |i| num = i + 1 pagecount = searchPair(mainword, num.to_s) logPagecount = Math.log(pagecount) / Math.log(2) lognum = Math.log(num) / Math.log(2) estkolmnum = 2 * lognum + 1 puts "#{num} #{lognum} #{pagecount} #{logPagecount} #{estkolmnum}" } end def makeSpanishWord(num) fail if num < 0; return "cero uno dos tres cuatro cinco seis siete ocho nueve diez".split()[num] if num < 10 return "once doce trece catorce quince dieciseis diecisiete dieciocho diecinueve veinte".split()[num-11] if num < 21 return "veinte treinta cuarenta cincuenta sesenta setenta ochenta noventa".split()[(num/10)-2]+(num%10==0?'':' y '+makeSpanishWord(num%10)) if num < 100 return "cien y #{makeSpanishWord(num-100)}" if num < 200 return "#{makeSpanishWord(num/100)}cientos y #{makeSpanishWord(num%100)}" if num < 1000 fail # return "#{\qw twenty thirty forty fifty sixty seventy return "unknown" end def makeEnglishWord(num) fail if num < 0; return "zero one two three four five six seven eight nine".split()[num] if num < 10 return "eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty".split()[num-11] if num < 21 return "twenty thirty forty fifty sixty seventy eighty ninety".split()[(num/10)-2]+(num%10==0?'':' '+makeEnglishWord(num%10)) if num < 100 return makeEnglishWord(num/100) + " hundred" + (num%100 == 0 ? '' : ' '+makeEnglishWord(num%100)) if num < 1000 fail # return "#{\qw twenty thirty forty fifty sixty seventy return "unknown" end def doSecondGraph maxnum = 500 maxnum.times { |i| j = i + 1 pgenglish = searchSingle(makeEnglishWord(j).gsub(/ /, '-')) logpgenglish = Math.log(pgenglish) / Math.log(2) logj = Math.log(j) / Math.log(2) puts "#{j} #{logj} #{pgenglish} #{logpgenglish}" } end def calcSum(a) acc = 0 a.each { |i| acc = acc + i } acc end def calcS(a,b) sumprod, suma, sumb = 0,calcSum(a), calcSum(b) a.each_index { |i| sumprod = sumprod + a[i] * b[i] } sumprod - suma * sumb / a.size end def calcCor(a, b) calcS(a,b) / Math.sqrt(calcS(a,a) * calcS(b,b)) end def doThirdGraph datafile = File.open("nums.dat", "wb") maxnum = 120 spgenglish = [ ] spgdigits = [ ] spgGreen = [ ] spgSpanish = [ ] maxnum.times { |i| j = i + 1 pgenglish = searchSingle(makeEnglishWord(j).gsub(/ /, '-')) pgspanish = searchSingle(makeSpanishWord(j).gsub(/ /, '-')) # puts "#{j} in Spanish is #{makeSpanishWord(j).gsub(/ /, '-')}" # puts "#{j} in English is #{makeEnglishWord(j).gsub(/ /, '-')}" pgdigits = searchSingle(j.to_s) pgGreen = searchPair("green", j.to_s) spgenglish << pgenglish spgdigits << pgdigits spgGreen << pgGreen spgSpanish << pgspanish logpgenglish = computeLogProb(pgenglish) logpgdigits = computeLogProb(pgdigits) logpgGreen = computeLogProb(pgGreen) logpgSpanish = computeLogProb(pgspanish) #logj = Math.log(j) / Math.log(2) datafile.puts "#{j} #{pgenglish} #{logpgenglish} #{pgdigits} #{logpgdigits} #{pgGreen} #{logpgGreen} #{pgspanish} #{logpgSpanish}" } datafile.close corED = calcCor(spgenglish, spgdigits) corEG = calcCor(spgenglish, spgGreen) corDG = calcCor(spgdigits, spgGreen) corES = calcCor(spgenglish, spgSpanish) corDS = calcCor(spgdigits, spgSpanish) puts "# Correlation between English,Digits: #{corED}" puts "# Correlation between English,GreenDig: #{corEG}" puts "# Correlation between Digits,GreenDig: #{corDG}" puts "# Correlation between English,Spanish: #{corES}" puts "# Correlation between Digits,Spanish: #{corDS}" gnuplotcmd = File.open("gnuplotnums.cmd", "wb") gnuplotcmd.write <