#!/usr/bin/ruby1.8
require 'GoogleSearchCache'
require 'Config'
include GoogleSearch

# Read the CompLearn configuration and build the search-cache client from it.
# @g is the object that searchPair and searchSingle send their queries to.
config = CompLearn::readConfig()
@g = GoogleSearchCache.new(config)

# Looks up a two-term query through the shared search cache.
#
# term1, term2 - the two search terms; they are handed to @g.getPageCount
#                together as a two-element array, in this order.
#
# Returns whatever @g.getPageCount returns for that query.
def searchPair(term1, term2)
  query = [term1, term2]
  @g.getPageCount(query)
end

# Looks up a single-term query through the shared search cache.
#
# term - the search term; it is wrapped in a one-element array before being
#        handed to @g.getPageCount.
#
# Returns whatever @g.getPageCount returns for that query.
def searchSingle(term)
  query = [term]
  @g.getPageCount(query)
end

# Converts a count into a base-2 logarithm of its share of a fixed total.
#
# num - the count; it is converted with to_f before dividing.
#
# Returns log2(num / 8000000000.0) as a Float. The divisor is a hard-coded
# total (presumably the assumed number of indexed pages - confirm).
def computeLogProb(num)
  fraction = num.to_f / 8000000000.0
  natural_log = Math.log(fraction)
  natural_log / Math.log(2.0)
end

# Prints one line per probed number for the "green <number>" pair query.
#
# The probe list is built from zero-based offsets: 0..399, then a fixed list
# of scattered values, then a list of round values each reduced by one. Every
# offset is turned into the number offset + 1 before it is searched.
#
# Each output line holds, separated by spaces:
#   number, log2(number), page count, log2(page count), 2 * log2(number) + 1
def doFirstGraph
  mainword = "green"
  log_of_two = Math.log(2)

  offsets = (0...400).to_a
  offsets.concat([ 813, 751, 1853, 2831, 5123, 7188, 11851, 23457, 45474, 68157, 193813, 245138, 356852, 621841, 1895867, 485763])
  # These are given as the numbers themselves, so shift them down to offsets.
  round_numbers = [1000,2000,4000,8000,16000,32000,64000,128000,256000,512000,1000000,2000000,5000000,250000,125000,10000, 100000, 9999,99999,999999 ]
  offsets.concat(round_numbers.map { |value| value - 1 })

  offsets.each do |offset|
    num = offset + 1
    pagecount = searchPair(mainword, num.to_s)
    logPagecount = Math.log(pagecount) / log_of_two
    lognum = Math.log(num) / log_of_two
    estkolmnum = 2 * lognum + 1
    puts "#{num} #{lognum} #{pagecount} #{logPagecount} #{estkolmnum}"
  end
end

# Spells a number from 0 to 999 as space-separated Spanish words.
#
# num - a non-negative Integer below 1000.
#
# Returns a String such as "quince", "treinta y uno" or
# "doscientos y cincuenta". Compound numbers are joined with " y ", and an
# exact multiple of ten or of a hundred carries no trailing remainder word.
#
# Raises RuntimeError when num is negative or 1000 and above.
#
# NOTE(review): hundreds and 21-29 are composed mechanically ("cien y uno",
# "veinte y uno", "cincocientos"), not in standard orthography - confirm
# whether the search terms are meant to use these forms.
def makeSpanishWord(num)
  fail "makeSpanishWord: negative number #{num}" if num < 0

  # One table for 0..20 so that 10 maps to "diez" instead of wrapping around
  # to the last entry of a separate 11..20 table.
  upto_twenty = %w(cero uno dos tres cuatro cinco seis siete ocho nueve diez
                   once doce trece catorce quince dieciseis diecisiete
                   dieciocho diecinueve veinte)
  return upto_twenty[num] if num <= 20

  if num < 100
    tens = %w(veinte treinta cuarenta cincuenta sesenta setenta ochenta noventa)[(num / 10) - 2]
    rest = num % 10
    return rest == 0 ? tens : "#{tens} y #{makeSpanishWord(rest)}"
  end

  if num < 1000
    hundreds = num < 200 ? "cien" : "#{makeSpanishWord(num / 100)}cientos"
    rest = num % 100
    # An exact hundred must not be followed by " y cero".
    return rest == 0 ? hundreds : "#{hundreds} y #{makeSpanishWord(rest)}"
  end

  fail "makeSpanishWord: #{num} is out of range (0..999)"
end

# Spells a number from 0 to 999 as space-separated English words.
#
# num - a non-negative Integer below 1000.
#
# Returns a String such as "ten", "twenty one" or "one hundred ten". Words
# are separated by single spaces; no "and" or hyphen is inserted.
#
# Raises RuntimeError when num is negative or 1000 and above.
def makeEnglishWord(num)
  fail "makeEnglishWord: negative number #{num}" if num < 0

  # One table for 0..20 that includes "ten"; without it 10 indexed the
  # 11..20 table at -1 and came back as "twenty".
  upto_twenty = %w(zero one two three four five six seven eight nine ten
                   eleven twelve thirteen fourteen fifteen sixteen seventeen
                   eighteen nineteen twenty)
  return upto_twenty[num] if num <= 20

  if num < 100
    tens = %w(twenty thirty forty fifty sixty seventy eighty ninety)[(num / 10) - 2]
    rest = num % 10
    return rest == 0 ? tens : "#{tens} #{makeEnglishWord(rest)}"
  end

  if num < 1000
    hundreds = "#{makeEnglishWord(num / 100)} hundred"
    rest = num % 100
    return rest == 0 ? hundreds : "#{hundreds} #{makeEnglishWord(rest)}"
  end

  fail "makeEnglishWord: #{num} is out of range (0..999)"
end

# Prints one line for each number from 1 to 500, using the page count of its
# spelled-out English name as the search result.
#
# The English spelling has its spaces replaced by hyphens before it is
# searched. Each output line holds, separated by spaces:
#   number, log2(number), page count, log2(page count)
def doSecondGraph
  log_of_two = Math.log(2)
  1.upto(500) do |j|
    term = makeEnglishWord(j).gsub(/ /, '-')
    pgenglish = searchSingle(term)
    logpgenglish = Math.log(pgenglish) / log_of_two
    logj = Math.log(j) / log_of_two
    puts "#{j} #{logj} #{pgenglish} #{logpgenglish}"
  end
end

# Adds up every element of a collection.
#
# a - an object responding to each (via Enumerable) whose elements support +.
#
# Returns the total, starting from the Integer 0, so an empty collection
# yields 0.
def calcSum(a)
  a.inject(0) { |total, value| total + value }
end

# Computes sum(a[i] * b[i]) - sum(a) * sum(b) / n, where n is a.size.
#
# a, b - arrays of numbers; b is indexed with every index of a, so it must be
#        at least as long as a.
#
# Returns the result as a Float. The subtraction is carried out on the exact
# numerator n * sum(a*b) - sum(a) * sum(b) and divided by n once at the end,
# so integer inputs are no longer truncated by integer division.
#
# Raises ZeroDivisionError when a is empty (n would be zero).
def calcS(a,b)
  n = a.size
  raise ZeroDivisionError, "calcS: cannot process an empty sample" if n.zero?

  suma = a.inject(0) { |acc, value| acc + value }
  sumb = b.inject(0) { |acc, value| acc + value }
  sumprod = 0
  a.each_index { |i| sumprod += a[i] * b[i] }

  # Dividing by n.to_f keeps the fractional part that Integer#/ would drop.
  (n * sumprod - suma * sumb) / n.to_f
end

# Computes the correlation coefficient of two equally indexed number series.
#
# a, b - arrays of numbers, passed straight through to calcS.
#
# Returns calcS(a, b) divided by the square root of calcS(a, a) * calcS(b, b).
def calcCor(a, b)
  cross_term = calcS(a, b)
  scale = Math.sqrt(calcS(a, a) * calcS(b, b))
  cross_term / scale
end

# Gathers page counts for the numbers 1..maxnum in four forms, writes them to
# "nums.dat", prints their pairwise correlations and writes a gnuplot command
# file "gnuplotnums.cmd" that plots the log-probability columns.
#
# maxnum - highest number to query (default 120). It must stay within the
#          range accepted by makeEnglishWord and makeSpanishWord.
#
# For each number the searches run in this order: hyphenated English
# spelling, hyphenated Spanish spelling, the digits alone, and the pair
# "green" + digits. Each nums.dat line holds, separated by spaces:
#   number, English count, English log prob, digit count, digit log prob,
#   green count, green log prob, Spanish count, Spanish log prob
#
# Both files are opened in block form so they are closed even if a search or
# a word conversion raises part-way through.
def doThirdGraph(maxnum = 120)
  spgenglish = [ ]
  spgdigits = [ ]
  spgGreen = [ ]
  spgSpanish = [ ]

  File.open("nums.dat", "wb") { |datafile|
    maxnum.times { |i|
      j = i + 1
      pgenglish = searchSingle(makeEnglishWord(j).gsub(/ /, '-'))
      pgspanish = searchSingle(makeSpanishWord(j).gsub(/ /, '-'))
      pgdigits = searchSingle(j.to_s)
      pgGreen = searchPair("green", j.to_s)
      spgenglish << pgenglish
      spgdigits << pgdigits
      spgGreen << pgGreen
      spgSpanish << pgspanish
      logpgenglish = computeLogProb(pgenglish)
      logpgdigits = computeLogProb(pgdigits)
      logpgGreen = computeLogProb(pgGreen)
      logpgSpanish = computeLogProb(pgspanish)
      datafile.puts "#{j} #{pgenglish} #{logpgenglish} #{pgdigits} #{logpgdigits} #{pgGreen} #{logpgGreen} #{pgspanish} #{logpgSpanish}"
    }
  }

  # Compute every coefficient before printing so a failure prints nothing.
  report = [
    ["English,Digits", spgenglish, spgdigits],
    ["English,GreenDig", spgenglish, spgGreen],
    ["Digits,GreenDig", spgdigits, spgGreen],
    ["English,Spanish", spgenglish, spgSpanish],
    ["Digits,Spanish", spgdigits, spgSpanish]
  ].map { |label, xs, ys| "# Correlation between #{label}: #{calcCor(xs, ys)}" }
  puts report

  # The range shown in the plot titles follows maxnum instead of being fixed.
  File.open("gnuplotnums.cmd", "wb") { |gnuplotcmd|
    gnuplotcmd.write <<EOF
set term postscript
set size 0.7,0.7
set xlabel "number"
set ylabel "log probability"
plot "nums.dat" using 1:3 title 'spelled numbers' with points pt 1, "nums.dat" using 1:5 title '1-#{maxnum}' with points pt 2, "nums.dat" using 1:7 title 'green 1-#{maxnum}' with points pt 4, "nums.dat" using 1:9 title 'spanish' with points pt 5
EOF
  }
end

# Script entry point: only the third graph is produced. doFirstGraph and
# doSecondGraph are defined above but are not called from here.
doThirdGraph()
