-- The queries mentioned in -- http://www.mysqlperformanceblog.com/2009/09/29/quick-comparison-of-myisam-infobright-and-monetdb/ -- http://www.mysqlperformanceblog.com/2009/10/02/analyzing-air-traffic-performance-with-infobright-and-monetdb/comment-page-1/#comment-686446 -- http://www.mysqlperformanceblog.com/2009/10/02/analyzing-air-traffic-performance-with-infobright-and-monetdb/ -- Platform Intel(R) Quad Core CPU Q6600 @ 2.40GHz with 8GB RAM and ample disk space (2x 500 GB SATA disk @ 7200 RPM as SW-RAID-0), running Fedora 12 -- author: M. Kersten -- date: 2010-Dec-31 -- (Feb2010 Release, loading time 90 min, storage 26.7 GB) -- (Oct2010 Releaase, loading time 84 min, storage 26.7 GB) -- reported loading in Oct'09 was MonetDB 3.6h Infobright 2.45h -- Direct loading using COPY ...LOCKED. -- development branch Feb2011 Dec 31,2010, load time 36 min, storage 26.7 GB -- loading 260 files, 14 had input errors and where skipped -- input was the original csv files from the source. No tweaking. -- All queries were ran in one user session with the server -- Each query was run three times to illustrate cold/hot timing -- As a frame of reference the results reported in the Percona blog are copied. -- And the times reported on the Greemplum blog, as they mention MonetDB too. -- http://community.greenplum.com/showthread.php?t=111&referrerid=9 -- All timings are obtained using 'mclient -lsql -t', which includes turn around time of results. -- The 'Feb2010 release code' is as we deliver binaries for production systems.. -- The other timings are code compiled for debugging, which includes lots of system sanity checks. -- MONETDB FEB2010 RELEASE IS SCHEDULED FOR PUBLIC DOWNLOAD FEB 28, 2010. -- For comparison the best (hot) times (in sec) are summmarized below -- MonetDB Infobright LucidDB Greenplum MonetDB -- Aug'09 SNE Feb'10 -- Q0 29.9 4.9 --- 2.9 1.8 -- Q1 7.9 12.1 54.8 3.0 0.7 -- Q2 0.9 6.4 21.5 2.2 0.5 -- Q3 1.7 7.3 23.9 2.5 0.6 -- Q4 0.3 1.0 2.9 0.5 0.2 -- Q5 0.5 2.9 9.7 0.8 0.2 -- Q6 12.5 21.8 22.6 2.9 1.8 -- Q7 27.9 8.6 22.9 9.2 1.3 -- Q8a 0.6 1.7 9.0 0.5 0.3 -- Q8b 1.1 3.7 16.9 1.0 0.5 -- Q8c 1.7 5.4 26.8 1.4 0.8 -- Q8d 2.2 7.2 37.9 1.8 1.0 -- Q8e 29.1 17.4 105.6 4.5 3.2 -- Q9 6.3 0.3 0.9 5.1 0.9 --Q0 reported MonetDB 29.9s Infobright 4.19s with t as (select "Year","Month",count(*) as c1 from ontime group by "Year","Month") select AVG(c1) FROM t; -- Oct2010 debugging code 10.1 6.7 6.6 -- Feb2010 debugging code 9.7 3.8 3.8 -- Feb2010 release code 7.0 1.8 1.8 --Q1 reported MonetDB 7.9s InfoBright 12.13 LucidDB 54.8 SELECT "DayOfWeek", count(*) AS c FROM ontime WHERE "Year" BETWEEN 2000 AND 2008 GROUP BY "DayOfWeek" ORDER BY c DESC; -- Oct2010 debugging code 1.8 1.8 1.7 -- Feb2010 debugging code 1.6 1.5 1.5 -- Feb2010 release code 0.7 0.7 0.7 --Q2 reported MonetDB 0.9s Infobright 6.37 LucidDB 21.5 SELECT "DayOfWeek", count(*) AS c FROM ontime WHERE "DepDelay">10 AND "Year" BETWEEN 2000 AND 2008 GROUP BY "DayOfWeek" ORDER BY c DESC; -- Oct2010 debugging code 3.3 0.9 0.9 -- Feb2010 debugging code 3.5 0.9 0.9 -- Feb2010 release code 3.2 0.4 0.5 --Q3 reported MonetDB 1.7s Infobright 7.29 LucidDB 23.9 SELECT "Origin", count(*) AS c FROM ontime WHERE "DepDelay">10 AND "Year" BETWEEN 2000 AND 2008 GROUP BY "Origin" ORDER BY c DESC LIMIT 10; -- Oct2010 debugging code 2.2 1.1 1.1 -- Feb2010 debugging code 2.8 1.2 1.2 -- Feb2010 release code 2.1 0.6 0.6 --Q4 reported MonetDB 0.27 Infobright 0.99 LucidDB 2.9 SELECT "Carrier", count(*) FROM ontime WHERE "DepDelay">10 AND "Year"=2007 GROUP BY "Carrier" ORDER BY 2 DESC; -- Oct2010 debugging code 0.5 0.5 0.5 -- Feb2010 debugging code 0.6 0.5 0.5 -- Feb2010 release code 0.4 0.2 0.2 --Q5 reported MonetDB 0.5 Infobright 2.92 LucidDB 9.7 WITH t AS (SELECT "Carrier", count(*) AS c FROM ontime WHERE "DepDelay">10 AND "Year"=2007 GROUP BY "Carrier"), t2 AS (SELECT "Carrier", count(*) AS c2 FROM ontime WHERE "Year"=2007 GROUP BY "Carrier") SELECT t."Carrier", c, c2, c*1000/c2 as c3 FROM t JOIN t2 ON (t."Carrier"=t2."Carrier") ORDER BY c3 DESC; -- Oct2010 debugging code 0.5 0.5 0.5 -- Feb2010 debugging code 0.6 0.6 0.6 -- Feb2010 release code 0.3 0.2 0.2 --Q6 reported MonetDB 12.5 Infobright 21.83 LucidDB 22.6 WITH t AS (SELECT "Carrier", count(*) AS c FROM ontime WHERE "DepDelay">10 AND "Year">=2000 and "Year" <=2008 GROUP BY "Carrier"), t2 AS (SELECT "Carrier", count(*) AS c2 FROM ontime WHERE "Year"=2007 GROUP BY "Carrier") SELECT t."Carrier", c, c2, c*1000/c2 as c3 FROM t JOIN t2 ON (t."Carrier"=t2."Carrier") ORDER BY c3 DESC; -- Oct2010 debugging code 2.1 1.0 1.0 -- Feb2010 debugging code 2.2 1.0 1.1 -- Feb2010 release code 1.6 0.5 0.5 -- Q7 reportd MonetDB 27.9 Infobright 8.59 LucidDB 22.9 with t as (select "Year",count(*)*1000 as c1 from ontime WHERE "DepDelay">10 GROUP BY "Year"), t2 as (select "Year",count(*) as c2 from ontime GROUP BY "Year") select t."Year", c1/c2 FROM t JOIN t2 ON (t."Year"=t2."Year"); -- Oct2010 debugging code 5.5 2.5 2.5 -- Feb2010 debugging code 4.5 2.3 2.3 -- Feb2010 release code 4.6 1.2 1.3 --Q8 reported -- Infobright MonetDB LucidDB -- 1y, 1.74s, 0.55s 9.0s -- 2y, 3.68s, 1.10s 16.9s -- 3y, 5.44s, 1.69s 26.8s -- 4y, 7.22s, 2.12s 37.9s -- 10y, 17.42s, 29.14s 105.6s SELECT "DestCityName", COUNT( DISTINCT "OriginCityName") FROM ontime WHERE "Year" BETWEEN 1990 and 1999 GROUP BY "DestCityName" ORDER BY 2 DESC LIMIT 10; -- Feb2010 debugging code -- 1990-1990 0.5 0.5 0.5 -- 1990-1991 1.5 1.1 1.1 -- 1990-1992 1.8 1.6 1.6 -- 1990-1993 2.3 2.1 2.1 -- 1990-1999 7.1 5.8 5.8 -- Feb2010 release code -- 1990-1990 0.5 0.3 0.3 -- 1990-1991 0.7 0.5 0.5 -- 1990-1992 0.9 0.8 0.8 -- 1990-1993 1.2 1.0 1.0 -- 1990-1999 4.6 3.2 3.2 --Q9 MonetDB 6.3 Infobright 0.3 LucidDB 0.9 select "Year",count(*) as c1 from ontime group by "Year"; -- Oct2010 debugging code 1.4 1.4 1.4 -- Feb2010 debugging code 1.4 1.4 1.4 -- Feb2010 release code 0.9 0.9 0.9 -- Comments: probably I uses a summary table -- The schema used: CREATE TABLE "ontime" ( "Year" smallint DEFAULT NULL, "Quarter" tinyint DEFAULT NULL, "Month" tinyint DEFAULT NULL, "DayofMonth" tinyint DEFAULT NULL, "DayOfWeek" tinyint DEFAULT NULL, "FlightDate" date DEFAULT NULL, "UniqueCarrier" char(7) DEFAULT NULL, "AirlineID" decimal(8,2) DEFAULT NULL, "Carrier" char(2) DEFAULT NULL, "TailNum" varchar(50) DEFAULT NULL, "FlightNum" varchar(10) DEFAULT NULL, "Origin" char(5) DEFAULT NULL, "OriginCityName" varchar(100) DEFAULT NULL, "OriginState" char(2) DEFAULT NULL, "OriginStateFips" varchar(10) DEFAULT NULL, "OriginStateName" varchar(100) DEFAULT NULL, "OriginWac" decimal(8,2) DEFAULT NULL, "Dest" char(5) DEFAULT NULL, "DestCityName" varchar(100) DEFAULT NULL, "DestState" char(2) DEFAULT NULL, "DestStateFips" varchar(10) DEFAULT NULL, "DestStateName" varchar(100) DEFAULT NULL, "DestWac" decimal(8,2) DEFAULT NULL, "CRSDepTime" decimal(8,2) DEFAULT NULL, "DepTime" decimal(8,2) DEFAULT NULL, "DepDelay" decimal(8,2) DEFAULT NULL, "DepDelayMinutes" decimal(8,2) DEFAULT NULL, "DepDel15" decimal(8,2) DEFAULT NULL, "DepartureDelayGroups" decimal(8,2) DEFAULT NULL, "DepTimeBlk" varchar(20) DEFAULT NULL, "TaxiOut" decimal(8,2) DEFAULT NULL, "WheelsOff" decimal(8,2) DEFAULT NULL, "WheelsOn" decimal(8,2) DEFAULT NULL, "TaxiIn" decimal(8,2) DEFAULT NULL, "CRSArrTime" decimal(8,2) DEFAULT NULL, "ArrTime" decimal(8,2) DEFAULT NULL, "ArrDelay" decimal(8,2) DEFAULT NULL, "ArrDelayMinutes" decimal(8,2) DEFAULT NULL, "ArrDel15" decimal(8,2) DEFAULT NULL, "ArrivalDelayGroups" decimal(8,2) DEFAULT NULL, "ArrTimeBlk" varchar(20) DEFAULT NULL, "Cancelled" tinyint DEFAULT NULL, "CancellationCode" char(1) DEFAULT NULL, "Diverted" tinyint DEFAULT NULL, "CRSElapsedTime" decimal(8,2) DEFAULT NULL, "ActualElapsedTime" decimal(8,2) DEFAULT NULL, "AirTime" decimal(8,2) DEFAULT NULL, "Flights" decimal(8,2) DEFAULT NULL, "Distance" decimal(8,2) DEFAULT NULL, "DistanceGroup" tinyint DEFAULT NULL, "CarrierDelay" decimal(8,2) DEFAULT NULL, "WeatherDelay" decimal(8,2) DEFAULT NULL, "NASDelay" decimal(8,2) DEFAULT NULL, "SecurityDelay" decimal(8,2) DEFAULT NULL, "LateAircraftDelay" decimal(8,2) DEFAULT NULL, "FirstDepTime" varchar(10) DEFAULT NULL, "TotalAddGTime" varchar(10) DEFAULT NULL, "LongestAddGTime" varchar(10) DEFAULT NULL, "DivAirportLandings" varchar(10) DEFAULT NULL, "DivReachedDest" varchar(10) DEFAULT NULL, "DivActualElapsedTime" varchar(10) DEFAULT NULL, "DivArrDelay" varchar(10) DEFAULT NULL, "DivDistance" varchar(10) DEFAULT NULL, "Div1Airport" varchar(10) DEFAULT NULL, "Div1WheelsOn" varchar(10) DEFAULT NULL, "Div1TotalGTime" varchar(10) DEFAULT NULL, "Div1LongestGTime" varchar(10) DEFAULT NULL, "Div1WheelsOff" varchar(10) DEFAULT NULL, "Div1TailNum" varchar(10) DEFAULT NULL, "Div2Airport" varchar(10) DEFAULT NULL, "Div2WheelsOn" varchar(10) DEFAULT NULL, "Div2TotalGTime" varchar(10) DEFAULT NULL, "Div2LongestGTime" varchar(10) DEFAULT NULL, "Div2WheelsOff" varchar(10) DEFAULT NULL, "Div2TailNum" varchar(10) DEFAULT NULL, "Div3Airport" varchar(10) DEFAULT NULL, "Div3WheelsOn" varchar(10) DEFAULT NULL, "Div3TotalGTime" varchar(10) DEFAULT NULL, "Div3LongestGTime" varchar(10) DEFAULT NULL, "Div3WheelsOff" varchar(10) DEFAULT NULL, "Div3TailNum" varchar(10) DEFAULT NULL, "Div4Airport" varchar(10) DEFAULT NULL, "Div4WheelsOn" varchar(10) DEFAULT NULL, "Div4TotalGTime" varchar(10) DEFAULT NULL, "Div4LongestGTime" varchar(10) DEFAULT NULL, "Div4WheelsOff" varchar(10) DEFAULT NULL, "Div4TailNum" varchar(10) DEFAULT NULL, "Div5Airport" varchar(10) DEFAULT NULL, "Div5WheelsOn" varchar(10) DEFAULT NULL, "Div5TotalGTime" varchar(10) DEFAULT NULL, "Div5LongestGTime" varchar(10) DEFAULT NULL, "Div5WheelsOff" varchar(10) DEFAULT NULL, "Div5TailNum" varchar(10) DEFAULT NULL ) ; -- the loading script after starting MonetDB and creation of the table. for i in 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 do for j in 1 2 3 4 5 6 7 8 9 10 11 12 do echo date " load $i $j" mclient -lsql -t -s "COPY 12000000 OFFSET 2 RECORDS INTO ontime2 FROM '/export/scratch0/mk/benchmark/data/On_Time_On_Time_Performance_${i}_${j}.csv' USING DELIMITERS ',','\n','\"' NULL AS ''; " done done -- a better way would have been to give a list of files instead of one -- nov26: does not make a difference if you run all files in one query. -- The reference database scheme uses varchar(10) for many columns that would fit a char or benefit from encoding -- In MonetDB the varchar(10) is read as a contract, preparing for longer strings. This explains most storage overhead. -- A simple dictionary encoding would save > 50G -- The value distribution obtained is, in square brackets the storage size needed select count(distinct "FlightDate") from ontime; -- 7398 [2] select count(distinct "UniqueCarrier") from ontime; --29 [1] select count(distinct "Carrier") from ontime; --31 [1] select count(distinct "TailNum") from ontime; --9306 [2] select count(distinct "FlightNum") from ontime; --8159 [2] select count(distinct "Origin") from ontime; --348 [2] select count(distinct "OriginCityName") from ontime; --354 [2] select count(distinct "OriginState") from ontime; --55 [1] select count(distinct "OriginStateFips") from ontime;--55 [1] select count(distinct "OriginStateName") from ontime;--55 [1] select count(distinct "Dest") from ontime; -- 353 [2] select count(distinct "DestCityName") from ontime; --356 [2] select count(distinct "DestState") from ontime; --55 [1] select count(distinct "DestStateFips") from ontime; --55 [1] select count(distinct "DestStateName") from ontime; --55 [1] select count(distinct "DepTimeBlk") from ontime; --19 [1] select count(distinct "ArrTimeBlk") from ontime; --19 [1] select count(distinct "CancellationCode") from ontime; --5 [1] select count(distinct "FirstDepTime") from ontime; --1223 [2] select count(distinct "TotalAddGTime") from ontime; --287 [2] select count(distinct "LongestAddGTime") from ontime; --281 [2] select count(distinct "DivAirportLandings") from ontime; --5 [1] select count(distinct "DivReachedDest") from ontime; --2 [1] select count(distinct "DivActualElapsedTime") from ontime; --809 [2] select count(distinct "DivArrDelay") from ontime; --737 [2] select count(distinct "DivDistance") from ontime; --531 [2] select count(distinct "Div1Airport") from ontime; --312 [2] select count(distinct "Div1WheelsOn") from ontime;--1282 [2] select count(distinct "Div1TotalGTime") from ontime; --240 [1] select count(distinct "Div1LongestGTime") from ontime; --234 [1] select count(distinct "Div1WheelsOff") from ontime; --1240 [2] select count(distinct "Div1TailNum") from ontime; --4211 [2] select count(distinct "Div2Airport") from ontime; --83 [1] select count(distinct "Div2WheelsOn") from ontime; --182 [1] select count(distinct "Div2TotalGTime") from ontime; --56 [1] select count(distinct "Div2LongestGTime") from ontime; --52 [1] select count(distinct "Div2WheelsOff") from ontime;--60 [1] select count(distinct "Div2TailNum") from ontime; --2 [1] select count(distinct "Div3Airport") from ontime; --2 [1] select count(distinct "Div3WheelsOn") from ontime; --1 [1] select count(distinct "Div3TotalGTime") from ontime; --1 [1] select count(distinct "Div3LongestGTime") from ontime; --1 [1] select count(distinct "Div3WheelsOff") from ontime; --1 [1] select count(distinct "Div3TailNum") from ontime; --1 [1] select count(distinct "Div4Airport") from ontime; --1 [1] select count(distinct "Div4WheelsOn") from ontime; --0 [1] select count(distinct "Div4TotalGTime") from ontime; --0 [1] select count(distinct "Div4LongestGTime") from ontime; --1 [1] select count(distinct "Div4WheelsOff") from ontime; --1 [1] select count(distinct "Div4TailNum") from ontime; --1 [1] select count(distinct "Div5Airport") from ontime; --0 [1] select count(distinct "Div5WheelsOn") from ontime; --1 [1] select count(distinct "Div5TotalGTime") from ontime; --0 [1] select count(distinct "Div5LongestGTime") from ontime; --0 [1] select count(distinct "Div5WheelsOff") from ontime; --1 [1] select count(distinct "Div5TailNum") from ontime; --1 [1]