diff --git a/doc/data/tips.csv b/doc/data/tips.csv new file mode 100644 index 0000000000000..c4558cce4ce36 --- /dev/null +++ b/doc/data/tips.csv @@ -0,0 +1,245 @@ +obs,totbill,tip,sex,smoker,day,time,size +1,16.99, 1.01,F,No,Sun,Night,2 +2,10.34, 1.66,M,No,Sun,Night,3 +3,21.01, 3.50,M,No,Sun,Night,3 +4,23.68, 3.31,M,No,Sun,Night,2 +5,24.59, 3.61,F,No,Sun,Night,4 +6,25.29, 4.71,M,No,Sun,Night,4 +7, 8.77, 2.00,M,No,Sun,Night,2 +8,26.88, 3.12,M,No,Sun,Night,4 +9,15.04, 1.96,M,No,Sun,Night,2 +10,14.78, 3.23,M,No,Sun,Night,2 +11,10.27, 1.71,M,No,Sun,Night,2 +12,35.26, 5.00,F,No,Sun,Night,4 +13,15.42, 1.57,M,No,Sun,Night,2 +14,18.43, 3.00,M,No,Sun,Night,4 +15,14.83, 3.02,F,No,Sun,Night,2 +16,21.58, 3.92,M,No,Sun,Night,2 +17,10.33, 1.67,F,No,Sun,Night,3 +18,16.29, 3.71,M,No,Sun,Night,3 +19,16.97, 3.50,F,No,Sun,Night,3 +20,20.65, 3.35,M,No,Sat,Night,3 +21,17.92, 4.08,M,No,Sat,Night,2 +22,20.29, 2.75,F,No,Sat,Night,2 +23,15.77, 2.23,F,No,Sat,Night,2 +24,39.42, 7.58,M,No,Sat,Night,4 +25,19.82, 3.18,M,No,Sat,Night,2 +26,17.81, 2.34,M,No,Sat,Night,4 +27,13.37, 2.00,M,No,Sat,Night,2 +28,12.69, 2.00,M,No,Sat,Night,2 +29,21.70, 4.30,M,No,Sat,Night,2 +30,19.65, 3.00,F,No,Sat,Night,2 +31, 9.55, 1.45,M,No,Sat,Night,2 +32,18.35, 2.50,M,No,Sat,Night,4 +33,15.06, 3.00,F,No,Sat,Night,2 +34,20.69, 2.45,F,No,Sat,Night,4 +35,17.78, 3.27,M,No,Sat,Night,2 +36,24.06, 3.60,M,No,Sat,Night,3 +37,16.31, 2.00,M,No,Sat,Night,3 +38,16.93, 3.07,F,No,Sat,Night,3 +39,18.69, 2.31,M,No,Sat,Night,3 +40,31.27, 5.00,M,No,Sat,Night,3 +41,16.04, 2.24,M,No,Sat,Night,3 +42,17.46, 2.54,M,No,Sun,Night,2 +43,13.94, 3.06,M,No,Sun,Night,2 +44, 9.68, 1.32,M,No,Sun,Night,2 +45,30.40, 5.60,M,No,Sun,Night,4 +46,18.29, 3.00,M,No,Sun,Night,2 +47,22.23, 5.00,M,No,Sun,Night,2 +48,32.40, 6.00,M,No,Sun,Night,4 +49,28.55, 2.05,M,No,Sun,Night,3 +50,18.04, 3.00,M,No,Sun,Night,2 +51,12.54, 2.50,M,No,Sun,Night,2 +52,10.29, 2.60,F,No,Sun,Night,2 +53,34.81, 5.20,F,No,Sun,Night,4 +54, 9.94, 1.56,M,No,Sun,Night,2 +55,25.56, 
4.34,M,No,Sun,Night,4 +56,19.49, 3.51,M,No,Sun,Night,2 +57,38.01, 3.00,M,Yes,Sat,Night,4 +58,26.41, 1.50,F,No,Sat,Night,2 +59,11.24, 1.76,M,Yes,Sat,Night,2 +60,48.27, 6.73,M,No,Sat,Night,4 +61,20.29, 3.21,M,Yes,Sat,Night,2 +62,13.81, 2.00,M,Yes,Sat,Night,2 +63,11.02, 1.98,M,Yes,Sat,Night,2 +64,18.29, 3.76,M,Yes,Sat,Night,4 +65,17.59, 2.64,M,No,Sat,Night,3 +66,20.08, 3.15,M,No,Sat,Night,3 +67,16.45, 2.47,F,No,Sat,Night,2 +68, 3.07, 1.00,F,Yes,Sat,Night,1 +69,20.23, 2.01,M,No,Sat,Night,2 +70,15.01, 2.09,M,Yes,Sat,Night,2 +71,12.02, 1.97,M,No,Sat,Night,2 +72,17.07, 3.00,F,No,Sat,Night,3 +73,26.86, 3.14,F,Yes,Sat,Night,2 +74,25.28, 5.00,F,Yes,Sat,Night,2 +75,14.73, 2.20,F,No,Sat,Night,2 +76,10.51, 1.25,M,No,Sat,Night,2 +77,17.92, 3.08,M,Yes,Sat,Night,2 +78,27.20, 4.00,M,No,Thu,Day,4 +79,22.76, 3.00,M,No,Thu,Day,2 +80,17.29, 2.71,M,No,Thu,Day,2 +81,19.44, 3.00,M,Yes,Thu,Day,2 +82,16.66, 3.40,M,No,Thu,Day,2 +83,10.07, 1.83,F,No,Thu,Day,1 +84,32.68, 5.00,M,Yes,Thu,Day,2 +85,15.98, 2.03,M,No,Thu,Day,2 +86,34.83, 5.17,F,No,Thu,Day,4 +87,13.03, 2.00,M,No,Thu,Day,2 +88,18.28, 4.00,M,No,Thu,Day,2 +89,24.71, 5.85,M,No,Thu,Day,2 +90,21.16, 3.00,M,No,Thu,Day,2 +91,28.97, 3.00,M,Yes,Fri,Night,2 +92,22.49, 3.50,M,No,Fri,Night,2 +93, 5.75, 1.00,F,Yes,Fri,Night,2 +94,16.32, 4.30,F,Yes,Fri,Night,2 +95,22.75, 3.25,F,No,Fri,Night,2 +96,40.17, 4.73,M,Yes,Fri,Night,4 +97,27.28, 4.00,M,Yes,Fri,Night,2 +98,12.03, 1.50,M,Yes,Fri,Night,2 +99,21.01, 3.00,M,Yes,Fri,Night,2 +100,12.46, 1.50,M,No,Fri,Night,2 +101,11.35, 2.50,F,Yes,Fri,Night,2 +102,15.38, 3.00,F,Yes,Fri,Night,2 +103,44.30, 2.50,F,Yes,Sat,Night,3 +104,22.42, 3.48,F,Yes,Sat,Night,2 +105,20.92, 4.08,F,No,Sat,Night,2 +106,15.36, 1.64,M,Yes,Sat,Night,2 +107,20.49, 4.06,M,Yes,Sat,Night,2 +108,25.21, 4.29,M,Yes,Sat,Night,2 +109,18.24, 3.76,M,No,Sat,Night,2 +110,14.31, 4.00,F,Yes,Sat,Night,2 +111,14.00, 3.00,M,No,Sat,Night,2 +112, 7.25, 1.00,F,No,Sat,Night,1 +113,38.07, 4.00,M,No,Sun,Night,3 +114,23.95, 2.55,M,No,Sun,Night,2 +115,25.71, 
4.00,F,No,Sun,Night,3 +116,17.31, 3.50,F,No,Sun,Night,2 +117,29.93, 5.07,M,No,Sun,Night,4 +118,10.65, 1.50,F,No,Thu,Day,2 +119,12.43, 1.80,F,No,Thu,Day,2 +120,24.08, 2.92,F,No,Thu,Day,4 +121,11.69, 2.31,M,No,Thu,Day,2 +122,13.42, 1.68,F,No,Thu,Day,2 +123,14.26, 2.50,M,No,Thu,Day,2 +124,15.95, 2.00,M,No,Thu,Day,2 +125,12.48, 2.52,F,No,Thu,Day,2 +126,29.80, 4.20,F,No,Thu,Day,6 +127, 8.52, 1.48,M,No,Thu,Day,2 +128,14.52, 2.00,F,No,Thu,Day,2 +129,11.38, 2.00,F,No,Thu,Day,2 +130,22.82, 2.18,M,No,Thu,Day,3 +131,19.08, 1.50,M,No,Thu,Day,2 +132,20.27, 2.83,F,No,Thu,Day,2 +133,11.17, 1.50,F,No,Thu,Day,2 +134,12.26, 2.00,F,No,Thu,Day,2 +135,18.26, 3.25,F,No,Thu,Day,2 +136, 8.51, 1.25,F,No,Thu,Day,2 +137,10.33, 2.00,F,No,Thu,Day,2 +138,14.15, 2.00,F,No,Thu,Day,2 +139,16.00, 2.00,M,Yes,Thu,Day,2 +140,13.16, 2.75,F,No,Thu,Day,2 +141,17.47, 3.50,F,No,Thu,Day,2 +142,34.30, 6.70,M,No,Thu,Day,6 +143,41.19, 5.00,M,No,Thu,Day,5 +144,27.05, 5.00,F,No,Thu,Day,6 +145,16.43, 2.30,F,No,Thu,Day,2 +146, 8.35, 1.50,F,No,Thu,Day,2 +147,18.64, 1.36,F,No,Thu,Day,3 +148,11.87, 1.63,F,No,Thu,Day,2 +149, 9.78, 1.73,M,No,Thu,Day,2 +150, 7.51, 2.00,M,No,Thu,Day,2 +151,14.07, 2.50,M,No,Sun,Night,2 +152,13.13, 2.00,M,No,Sun,Night,2 +153,17.26, 2.74,M,No,Sun,Night,3 +154,24.55, 2.00,M,No,Sun,Night,4 +155,19.77, 2.00,M,No,Sun,Night,4 +156,29.85, 5.14,F,No,Sun,Night,5 +157,48.17, 5.00,M,No,Sun,Night,6 +158,25.00, 3.75,F,No,Sun,Night,4 +159,13.39, 2.61,F,No,Sun,Night,2 +160,16.49, 2.00,M,No,Sun,Night,4 +161,21.50, 3.50,M,No,Sun,Night,4 +162,12.66, 2.50,M,No,Sun,Night,2 +163,16.21, 2.00,F,No,Sun,Night,3 +164,13.81, 2.00,M,No,Sun,Night,2 +165,17.51, 3.00,F,Yes,Sun,Night,2 +166,24.52, 3.48,M,No,Sun,Night,3 +167,20.76, 2.24,M,No,Sun,Night,2 +168,31.71, 4.50,M,No,Sun,Night,4 +169,10.59, 1.61,F,Yes,Sat,Night,2 +170,10.63, 2.00,F,Yes,Sat,Night,2 +171,50.81,10.00,M,Yes,Sat,Night,3 +172,15.81, 3.16,M,Yes,Sat,Night,2 +173, 7.25, 5.15,M,Yes,Sun,Night,2 +174,31.85, 3.18,M,Yes,Sun,Night,2 +175,16.82, 
4.00,M,Yes,Sun,Night,2 +176,32.90, 3.11,M,Yes,Sun,Night,2 +177,17.89, 2.00,M,Yes,Sun,Night,2 +178,14.48, 2.00,M,Yes,Sun,Night,2 +179, 9.60, 4.00,F,Yes,Sun,Night,2 +180,34.63, 3.55,M,Yes,Sun,Night,2 +181,34.65, 3.68,M,Yes,Sun,Night,4 +182,23.33, 5.65,M,Yes,Sun,Night,2 +183,45.35, 3.50,M,Yes,Sun,Night,3 +184,23.17, 6.50,M,Yes,Sun,Night,4 +185,40.55, 3.00,M,Yes,Sun,Night,2 +186,20.69, 5.00,M,No,Sun,Night,5 +187,20.90, 3.50,F,Yes,Sun,Night,3 +188,30.46, 2.00,M,Yes,Sun,Night,5 +189,18.15, 3.50,F,Yes,Sun,Night,3 +190,23.10, 4.00,M,Yes,Sun,Night,3 +191,15.69, 1.50,M,Yes,Sun,Night,2 +192,19.81, 4.19,F,Yes,Thu,Day,2 +193,28.44, 2.56,M,Yes,Thu,Day,2 +194,15.48, 2.02,M,Yes,Thu,Day,2 +195,16.58, 4.00,M,Yes,Thu,Day,2 +196, 7.56, 1.44,M,No,Thu,Day,2 +197,10.34, 2.00,M,Yes,Thu,Day,2 +198,43.11, 5.00,F,Yes,Thu,Day,4 +199,13.00, 2.00,F,Yes,Thu,Day,2 +200,13.51, 2.00,M,Yes,Thu,Day,2 +201,18.71, 4.00,M,Yes,Thu,Day,3 +202,12.74, 2.01,F,Yes,Thu,Day,2 +203,13.00, 2.00,F,Yes,Thu,Day,2 +204,16.40, 2.50,F,Yes,Thu,Day,2 +205,20.53, 4.00,M,Yes,Thu,Day,4 +206,16.47, 3.23,F,Yes,Thu,Day,3 +207,26.59, 3.41,M,Yes,Sat,Night,3 +208,38.73, 3.00,M,Yes,Sat,Night,4 +209,24.27, 2.03,M,Yes,Sat,Night,2 +210,12.76, 2.23,F,Yes,Sat,Night,2 +211,30.06, 2.00,M,Yes,Sat,Night,3 +212,25.89, 5.16,M,Yes,Sat,Night,4 +213,48.33, 9.00,M,No,Sat,Night,4 +214,13.27, 2.50,F,Yes,Sat,Night,2 +215,28.17, 6.50,F,Yes,Sat,Night,3 +216,12.90, 1.10,F,Yes,Sat,Night,2 +217,28.15, 3.00,M,Yes,Sat,Night,5 +218,11.59, 1.50,M,Yes,Sat,Night,2 +219, 7.74, 1.44,M,Yes,Sat,Night,2 +220,30.14, 3.09,F,Yes,Sat,Night,4 +221,12.16, 2.20,M,Yes,Fri,Day,2 +222,13.42, 3.48,F,Yes,Fri,Day,2 +223, 8.58, 1.92,M,Yes,Fri,Day,1 +224,15.98, 3.00,F,No,Fri,Day,3 +225,13.42, 1.58,M,Yes,Fri,Day,2 +226,16.27, 2.50,F,Yes,Fri,Day,2 +227,10.09, 2.00,F,Yes,Fri,Day,2 +228,20.45, 3.00,M,No,Sat,Night,4 +229,13.28, 2.72,M,No,Sat,Night,2 +230,22.12, 2.88,F,Yes,Sat,Night,2 +231,24.01, 2.00,M,Yes,Sat,Night,4 +232,15.69, 3.00,M,Yes,Sat,Night,3 +233,11.61, 
3.39,M,No,Sat,Night,2 +234,10.77, 1.47,M,No,Sat,Night,2 +235,15.53, 3.00,M,Yes,Sat,Night,2 +236,10.07, 1.25,M,No,Sat,Night,2 +237,12.60, 1.00,M,Yes,Sat,Night,2 +238,32.83, 1.17,M,Yes,Sat,Night,2 +239,35.83, 4.67,F,No,Sat,Night,3 +240,29.03, 5.92,M,No,Sat,Night,3 +241,27.18, 2.00,F,Yes,Sat,Night,2 +242,22.67, 2.00,M,Yes,Sat,Night,2 +243,17.82, 1.75,M,No,Sat,Night,2 +244,18.78, 3.00,F,No,Thu,Night,2 diff --git a/doc/source/index.rst b/doc/source/index.rst index b4b9231b6d34d..4ef6f1b105dd7 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -122,6 +122,7 @@ See the package overview for more detail about what's in the library. reshaping timeseries visualization + rplot io sparse gotchas diff --git a/doc/source/rplot.rst b/doc/source/rplot.rst new file mode 100644 index 0000000000000..7153d5323c805 --- /dev/null +++ b/doc/source/rplot.rst @@ -0,0 +1,160 @@ +.. currentmodule:: pandas +.. _rplot: + +.. ipython:: python + :suppress: + + import numpy as np + np.random.seed(123456) + from pandas import * + import pandas.util.testing as tm + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + import matplotlib.pyplot as plt + tips_data = read_csv('data/tips.csv') + iris_data = read_csv('data/iris.data') + from pandas import read_csv + from pandas.tools.plotting import radviz + import pandas.tools.rplot as rplot + plt.close('all') + +************************** +Trellis plotting interface +************************** + +-------- +Examples +-------- + +RPlot is a flexible API for producing Trellis plots. These plots allow you to arrange data in a rectangular grid by values of certain attributes. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='totbill', y='tip') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomHistogram()) + + @savefig rplot1_tips.png width=8in + plot.render(plt.gcf()) + +In the example above, data from the tips data set is arranged by the attributes 'sex' and 'smoker'. 
Since both of those attributes can take on one of two values, the resulting grid has two columns and two rows. A histogram is displayed for each cell of the grid. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='totbill', y='tip') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomDensity()) + + @savefig rplot2_tips.png width=8in + plot.render(plt.gcf()) + +The example above is the same as the previous one, except that the plot is set to kernel density estimation. This shows how easy it is to have different plots for the same Trellis structure. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='totbill', y='tip') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomScatter()) + plot.add(rplot.GeomPolyFit(degree=2)) + + @savefig rplot3_tips.png width=8in + plot.render(plt.gcf()) + +The plot above shows that it is possible to have two or more plots for the same data displayed on the same Trellis grid cell. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='totbill', y='tip') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomScatter()) + plot.add(rplot.GeomDensity2D()) + + @savefig rplot4_tips.png width=8in + plot.render(plt.gcf()) + +Above is a similar plot, but with a 2D kernel density estimation plot superimposed. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='totbill', y='tip') + plot.add(rplot.TrellisGrid(['sex', '.'])) + plot.add(rplot.GeomHistogram()) + + @savefig rplot5_tips.png width=8in + plot.render(plt.gcf()) + +It is possible to only use one attribute for grouping data. The example above only uses the 'sex' attribute. If the second grouping attribute is not specified, the plots will be arranged in a column. + +.. 
ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='totbill', y='tip') + plot.add(rplot.TrellisGrid(['.', 'smoker'])) + plot.add(rplot.GeomHistogram()) + + @savefig rplot6_tips.png width=8in + plot.render(plt.gcf()) + +If the first grouping attribute is not specified, the plots will be arranged in a row. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='totbill', y='tip') + plot.add(rplot.TrellisGrid(['.', 'smoker'])) + plot.add(rplot.GeomHistogram()) + + plot = rplot.RPlot(tips_data, x='tip', y='totbill') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomPoint(size=80.0, colour=rplot.ScaleRandomColour('day'), shape=rplot.ScaleShape('size'), alpha=1.0)) + + @savefig rplot7_tips.png width=8in + plot.render(plt.gcf()) + +As shown above, scatter plots are also possible. Scatter plots allow you to map various data attributes to graphical properties of the plot. In the example above, the colour and shape of the scatter plot graphical objects are mapped to the 'day' and 'size' attributes, respectively. You use scale objects to specify these mappings. The list of scale classes is given below with initialization arguments for quick reference. + +------ +Scales +------ + +:: + + ScaleGradient(column, colour1, colour2) + +This scale allows you to map the value of an attribute (specified by the parameter column) to the colour of a graphical object. The larger the value of the attribute, the closer the colour will be to colour2; the smaller the value, the closer it will be to colour1. + +:: + + ScaleGradient2(column, colour1, colour2, colour3) + +The same as ScaleGradient but interpolates linearly between three colours instead of two. + +:: + + ScaleSize(column, min_size, max_size, transform) + +Map attribute value to size of the graphical object. 
Parameter min_size (default 5.0) is the minimum size of the graphical object, max_size (default 100.0) is the maximum size and transform is a one argument function that will be used to transform the attribute value (defaults to lambda x: x). + +:: + + ScaleShape(column) + +Map the shape of the object to attribute value. The attribute has to be categorical. + +:: + + ScaleRandomColour(column) + +Assign a random colour to a value of categorical attribute specified by column. diff --git a/pandas/tests/data/tips.csv b/pandas/tests/data/tips.csv new file mode 100644 index 0000000000000..856a65a69e647 --- /dev/null +++ b/pandas/tests/data/tips.csv @@ -0,0 +1,245 @@ +total_bill,tip,sex,smoker,day,time,size +16.99,1.01,Female,No,Sun,Dinner,2 +10.34,1.66,Male,No,Sun,Dinner,3 +21.01,3.5,Male,No,Sun,Dinner,3 +23.68,3.31,Male,No,Sun,Dinner,2 +24.59,3.61,Female,No,Sun,Dinner,4 +25.29,4.71,Male,No,Sun,Dinner,4 +8.77,2.0,Male,No,Sun,Dinner,2 +26.88,3.12,Male,No,Sun,Dinner,4 +15.04,1.96,Male,No,Sun,Dinner,2 +14.78,3.23,Male,No,Sun,Dinner,2 +10.27,1.71,Male,No,Sun,Dinner,2 +35.26,5.0,Female,No,Sun,Dinner,4 +15.42,1.57,Male,No,Sun,Dinner,2 +18.43,3.0,Male,No,Sun,Dinner,4 +14.83,3.02,Female,No,Sun,Dinner,2 +21.58,3.92,Male,No,Sun,Dinner,2 +10.33,1.67,Female,No,Sun,Dinner,3 +16.29,3.71,Male,No,Sun,Dinner,3 +16.97,3.5,Female,No,Sun,Dinner,3 +20.65,3.35,Male,No,Sat,Dinner,3 +17.92,4.08,Male,No,Sat,Dinner,2 +20.29,2.75,Female,No,Sat,Dinner,2 +15.77,2.23,Female,No,Sat,Dinner,2 +39.42,7.58,Male,No,Sat,Dinner,4 +19.82,3.18,Male,No,Sat,Dinner,2 +17.81,2.34,Male,No,Sat,Dinner,4 +13.37,2.0,Male,No,Sat,Dinner,2 +12.69,2.0,Male,No,Sat,Dinner,2 +21.7,4.3,Male,No,Sat,Dinner,2 +19.65,3.0,Female,No,Sat,Dinner,2 +9.55,1.45,Male,No,Sat,Dinner,2 +18.35,2.5,Male,No,Sat,Dinner,4 +15.06,3.0,Female,No,Sat,Dinner,2 +20.69,2.45,Female,No,Sat,Dinner,4 +17.78,3.27,Male,No,Sat,Dinner,2 +24.06,3.6,Male,No,Sat,Dinner,3 +16.31,2.0,Male,No,Sat,Dinner,3 +16.93,3.07,Female,No,Sat,Dinner,3 
+18.69,2.31,Male,No,Sat,Dinner,3 +31.27,5.0,Male,No,Sat,Dinner,3 +16.04,2.24,Male,No,Sat,Dinner,3 +17.46,2.54,Male,No,Sun,Dinner,2 +13.94,3.06,Male,No,Sun,Dinner,2 +9.68,1.32,Male,No,Sun,Dinner,2 +30.4,5.6,Male,No,Sun,Dinner,4 +18.29,3.0,Male,No,Sun,Dinner,2 +22.23,5.0,Male,No,Sun,Dinner,2 +32.4,6.0,Male,No,Sun,Dinner,4 +28.55,2.05,Male,No,Sun,Dinner,3 +18.04,3.0,Male,No,Sun,Dinner,2 +12.54,2.5,Male,No,Sun,Dinner,2 +10.29,2.6,Female,No,Sun,Dinner,2 +34.81,5.2,Female,No,Sun,Dinner,4 +9.94,1.56,Male,No,Sun,Dinner,2 +25.56,4.34,Male,No,Sun,Dinner,4 +19.49,3.51,Male,No,Sun,Dinner,2 +38.01,3.0,Male,Yes,Sat,Dinner,4 +26.41,1.5,Female,No,Sat,Dinner,2 +11.24,1.76,Male,Yes,Sat,Dinner,2 +48.27,6.73,Male,No,Sat,Dinner,4 +20.29,3.21,Male,Yes,Sat,Dinner,2 +13.81,2.0,Male,Yes,Sat,Dinner,2 +11.02,1.98,Male,Yes,Sat,Dinner,2 +18.29,3.76,Male,Yes,Sat,Dinner,4 +17.59,2.64,Male,No,Sat,Dinner,3 +20.08,3.15,Male,No,Sat,Dinner,3 +16.45,2.47,Female,No,Sat,Dinner,2 +3.07,1.0,Female,Yes,Sat,Dinner,1 +20.23,2.01,Male,No,Sat,Dinner,2 +15.01,2.09,Male,Yes,Sat,Dinner,2 +12.02,1.97,Male,No,Sat,Dinner,2 +17.07,3.0,Female,No,Sat,Dinner,3 +26.86,3.14,Female,Yes,Sat,Dinner,2 +25.28,5.0,Female,Yes,Sat,Dinner,2 +14.73,2.2,Female,No,Sat,Dinner,2 +10.51,1.25,Male,No,Sat,Dinner,2 +17.92,3.08,Male,Yes,Sat,Dinner,2 +27.2,4.0,Male,No,Thur,Lunch,4 +22.76,3.0,Male,No,Thur,Lunch,2 +17.29,2.71,Male,No,Thur,Lunch,2 +19.44,3.0,Male,Yes,Thur,Lunch,2 +16.66,3.4,Male,No,Thur,Lunch,2 +10.07,1.83,Female,No,Thur,Lunch,1 +32.68,5.0,Male,Yes,Thur,Lunch,2 +15.98,2.03,Male,No,Thur,Lunch,2 +34.83,5.17,Female,No,Thur,Lunch,4 +13.03,2.0,Male,No,Thur,Lunch,2 +18.28,4.0,Male,No,Thur,Lunch,2 +24.71,5.85,Male,No,Thur,Lunch,2 +21.16,3.0,Male,No,Thur,Lunch,2 +28.97,3.0,Male,Yes,Fri,Dinner,2 +22.49,3.5,Male,No,Fri,Dinner,2 +5.75,1.0,Female,Yes,Fri,Dinner,2 +16.32,4.3,Female,Yes,Fri,Dinner,2 +22.75,3.25,Female,No,Fri,Dinner,2 +40.17,4.73,Male,Yes,Fri,Dinner,4 +27.28,4.0,Male,Yes,Fri,Dinner,2 +12.03,1.5,Male,Yes,Fri,Dinner,2 
+21.01,3.0,Male,Yes,Fri,Dinner,2 +12.46,1.5,Male,No,Fri,Dinner,2 +11.35,2.5,Female,Yes,Fri,Dinner,2 +15.38,3.0,Female,Yes,Fri,Dinner,2 +44.3,2.5,Female,Yes,Sat,Dinner,3 +22.42,3.48,Female,Yes,Sat,Dinner,2 +20.92,4.08,Female,No,Sat,Dinner,2 +15.36,1.64,Male,Yes,Sat,Dinner,2 +20.49,4.06,Male,Yes,Sat,Dinner,2 +25.21,4.29,Male,Yes,Sat,Dinner,2 +18.24,3.76,Male,No,Sat,Dinner,2 +14.31,4.0,Female,Yes,Sat,Dinner,2 +14.0,3.0,Male,No,Sat,Dinner,2 +7.25,1.0,Female,No,Sat,Dinner,1 +38.07,4.0,Male,No,Sun,Dinner,3 +23.95,2.55,Male,No,Sun,Dinner,2 +25.71,4.0,Female,No,Sun,Dinner,3 +17.31,3.5,Female,No,Sun,Dinner,2 +29.93,5.07,Male,No,Sun,Dinner,4 +10.65,1.5,Female,No,Thur,Lunch,2 +12.43,1.8,Female,No,Thur,Lunch,2 +24.08,2.92,Female,No,Thur,Lunch,4 +11.69,2.31,Male,No,Thur,Lunch,2 +13.42,1.68,Female,No,Thur,Lunch,2 +14.26,2.5,Male,No,Thur,Lunch,2 +15.95,2.0,Male,No,Thur,Lunch,2 +12.48,2.52,Female,No,Thur,Lunch,2 +29.8,4.2,Female,No,Thur,Lunch,6 +8.52,1.48,Male,No,Thur,Lunch,2 +14.52,2.0,Female,No,Thur,Lunch,2 +11.38,2.0,Female,No,Thur,Lunch,2 +22.82,2.18,Male,No,Thur,Lunch,3 +19.08,1.5,Male,No,Thur,Lunch,2 +20.27,2.83,Female,No,Thur,Lunch,2 +11.17,1.5,Female,No,Thur,Lunch,2 +12.26,2.0,Female,No,Thur,Lunch,2 +18.26,3.25,Female,No,Thur,Lunch,2 +8.51,1.25,Female,No,Thur,Lunch,2 +10.33,2.0,Female,No,Thur,Lunch,2 +14.15,2.0,Female,No,Thur,Lunch,2 +16.0,2.0,Male,Yes,Thur,Lunch,2 +13.16,2.75,Female,No,Thur,Lunch,2 +17.47,3.5,Female,No,Thur,Lunch,2 +34.3,6.7,Male,No,Thur,Lunch,6 +41.19,5.0,Male,No,Thur,Lunch,5 +27.05,5.0,Female,No,Thur,Lunch,6 +16.43,2.3,Female,No,Thur,Lunch,2 +8.35,1.5,Female,No,Thur,Lunch,2 +18.64,1.36,Female,No,Thur,Lunch,3 +11.87,1.63,Female,No,Thur,Lunch,2 +9.78,1.73,Male,No,Thur,Lunch,2 +7.51,2.0,Male,No,Thur,Lunch,2 +14.07,2.5,Male,No,Sun,Dinner,2 +13.13,2.0,Male,No,Sun,Dinner,2 +17.26,2.74,Male,No,Sun,Dinner,3 +24.55,2.0,Male,No,Sun,Dinner,4 +19.77,2.0,Male,No,Sun,Dinner,4 +29.85,5.14,Female,No,Sun,Dinner,5 +48.17,5.0,Male,No,Sun,Dinner,6 
+25.0,3.75,Female,No,Sun,Dinner,4 +13.39,2.61,Female,No,Sun,Dinner,2 +16.49,2.0,Male,No,Sun,Dinner,4 +21.5,3.5,Male,No,Sun,Dinner,4 +12.66,2.5,Male,No,Sun,Dinner,2 +16.21,2.0,Female,No,Sun,Dinner,3 +13.81,2.0,Male,No,Sun,Dinner,2 +17.51,3.0,Female,Yes,Sun,Dinner,2 +24.52,3.48,Male,No,Sun,Dinner,3 +20.76,2.24,Male,No,Sun,Dinner,2 +31.71,4.5,Male,No,Sun,Dinner,4 +10.59,1.61,Female,Yes,Sat,Dinner,2 +10.63,2.0,Female,Yes,Sat,Dinner,2 +50.81,10.0,Male,Yes,Sat,Dinner,3 +15.81,3.16,Male,Yes,Sat,Dinner,2 +7.25,5.15,Male,Yes,Sun,Dinner,2 +31.85,3.18,Male,Yes,Sun,Dinner,2 +16.82,4.0,Male,Yes,Sun,Dinner,2 +32.9,3.11,Male,Yes,Sun,Dinner,2 +17.89,2.0,Male,Yes,Sun,Dinner,2 +14.48,2.0,Male,Yes,Sun,Dinner,2 +9.6,4.0,Female,Yes,Sun,Dinner,2 +34.63,3.55,Male,Yes,Sun,Dinner,2 +34.65,3.68,Male,Yes,Sun,Dinner,4 +23.33,5.65,Male,Yes,Sun,Dinner,2 +45.35,3.5,Male,Yes,Sun,Dinner,3 +23.17,6.5,Male,Yes,Sun,Dinner,4 +40.55,3.0,Male,Yes,Sun,Dinner,2 +20.69,5.0,Male,No,Sun,Dinner,5 +20.9,3.5,Female,Yes,Sun,Dinner,3 +30.46,2.0,Male,Yes,Sun,Dinner,5 +18.15,3.5,Female,Yes,Sun,Dinner,3 +23.1,4.0,Male,Yes,Sun,Dinner,3 +15.69,1.5,Male,Yes,Sun,Dinner,2 +19.81,4.19,Female,Yes,Thur,Lunch,2 +28.44,2.56,Male,Yes,Thur,Lunch,2 +15.48,2.02,Male,Yes,Thur,Lunch,2 +16.58,4.0,Male,Yes,Thur,Lunch,2 +7.56,1.44,Male,No,Thur,Lunch,2 +10.34,2.0,Male,Yes,Thur,Lunch,2 +43.11,5.0,Female,Yes,Thur,Lunch,4 +13.0,2.0,Female,Yes,Thur,Lunch,2 +13.51,2.0,Male,Yes,Thur,Lunch,2 +18.71,4.0,Male,Yes,Thur,Lunch,3 +12.74,2.01,Female,Yes,Thur,Lunch,2 +13.0,2.0,Female,Yes,Thur,Lunch,2 +16.4,2.5,Female,Yes,Thur,Lunch,2 +20.53,4.0,Male,Yes,Thur,Lunch,4 +16.47,3.23,Female,Yes,Thur,Lunch,3 +26.59,3.41,Male,Yes,Sat,Dinner,3 +38.73,3.0,Male,Yes,Sat,Dinner,4 +24.27,2.03,Male,Yes,Sat,Dinner,2 +12.76,2.23,Female,Yes,Sat,Dinner,2 +30.06,2.0,Male,Yes,Sat,Dinner,3 +25.89,5.16,Male,Yes,Sat,Dinner,4 +48.33,9.0,Male,No,Sat,Dinner,4 +13.27,2.5,Female,Yes,Sat,Dinner,2 +28.17,6.5,Female,Yes,Sat,Dinner,3 +12.9,1.1,Female,Yes,Sat,Dinner,2 
+28.15,3.0,Male,Yes,Sat,Dinner,5 +11.59,1.5,Male,Yes,Sat,Dinner,2 +7.74,1.44,Male,Yes,Sat,Dinner,2 +30.14,3.09,Female,Yes,Sat,Dinner,4 +12.16,2.2,Male,Yes,Fri,Lunch,2 +13.42,3.48,Female,Yes,Fri,Lunch,2 +8.58,1.92,Male,Yes,Fri,Lunch,1 +15.98,3.0,Female,No,Fri,Lunch,3 +13.42,1.58,Male,Yes,Fri,Lunch,2 +16.27,2.5,Female,Yes,Fri,Lunch,2 +10.09,2.0,Female,Yes,Fri,Lunch,2 +20.45,3.0,Male,No,Sat,Dinner,4 +13.28,2.72,Male,No,Sat,Dinner,2 +22.12,2.88,Female,Yes,Sat,Dinner,2 +24.01,2.0,Male,Yes,Sat,Dinner,4 +15.69,3.0,Male,Yes,Sat,Dinner,3 +11.61,3.39,Male,No,Sat,Dinner,2 +10.77,1.47,Male,No,Sat,Dinner,2 +15.53,3.0,Male,Yes,Sat,Dinner,2 +10.07,1.25,Male,No,Sat,Dinner,2 +12.6,1.0,Male,Yes,Sat,Dinner,2 +32.83,1.17,Male,Yes,Sat,Dinner,2 +35.83,4.67,Female,No,Sat,Dinner,3 +29.03,5.92,Male,No,Sat,Dinner,3 +27.18,2.0,Female,Yes,Sat,Dinner,2 +22.67,2.0,Male,Yes,Sat,Dinner,2 +17.82,1.75,Male,No,Sat,Dinner,2 +18.78,3.0,Female,No,Thur,Dinner,2 diff --git a/pandas/tests/test_rplot.py b/pandas/tests/test_rplot.py new file mode 100644 index 0000000000000..f21296f9a952b --- /dev/null +++ b/pandas/tests/test_rplot.py @@ -0,0 +1,269 @@ +import unittest +import pandas.tools.rplot as rplot +from pandas import read_csv +import os +import matplotlib.pyplot as plt +import pdb + +def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth + +def between(a, b, x): + """Check if x is in the somewhere between a and b. + + Parameters: + ----------- + a: float, interval start + b: float, interval end + x: float, value to test for + + Returns: + -------- + True if x is between a and b, False otherwise + """ + if a < b: + return x >= a and x <= b + else: + return x <= a and x >= b + +class TestUtilityFunctions(unittest.TestCase): + """ + Tests for RPlot utility functions. 
+ """ + def setUp(self): + path = os.path.join(curpath(), 'data/iris.csv') + self.data = read_csv(path, sep=',') + + def test_make_aes1(self): + aes = rplot.make_aes() + self.assertTrue(aes['x'] is None) + self.assertTrue(aes['y'] is None) + self.assertTrue(aes['size'] is None) + self.assertTrue(aes['colour'] is None) + self.assertTrue(aes['shape'] is None) + self.assertTrue(aes['alpha'] is None) + self.assertTrue(type(aes) is dict) + + def test_make_aes2(self): + with self.assertRaises(ValueError): + rplot.make_aes(size=rplot.ScaleShape('test')) + with self.assertRaises(ValueError): + rplot.make_aes(colour=rplot.ScaleShape('test')) + with self.assertRaises(ValueError): + rplot.make_aes(shape=rplot.ScaleSize('test')) + with self.assertRaises(ValueError): + rplot.make_aes(alpha=rplot.ScaleShape('test')) + + def test_dictionary_union(self): + dict1 = {1 : 1, 2 : 2, 3 : 3} + dict2 = {1 : 1, 2 : 2, 4 : 4} + union = rplot.dictionary_union(dict1, dict2) + self.assertEqual(len(union), 4) + keys = union.keys() + self.assertTrue(1 in keys) + self.assertTrue(2 in keys) + self.assertTrue(3 in keys) + self.assertTrue(4 in keys) + self.assertTrue(rplot.dictionary_union(dict1, {}) == dict1) + self.assertTrue(rplot.dictionary_union({}, dict1) == dict1) + self.assertTrue(rplot.dictionary_union({}, {}) == {}) + + def test_merge_aes(self): + layer1 = rplot.Layer(size=rplot.ScaleSize('test')) + layer2 = rplot.Layer(shape=rplot.ScaleShape('test')) + rplot.merge_aes(layer1, layer2) + self.assertTrue(isinstance(layer2.aes['size'], rplot.ScaleSize)) + self.assertTrue(isinstance(layer2.aes['shape'], rplot.ScaleShape)) + self.assertTrue(layer2.aes['size'] == layer1.aes['size']) + for key in layer2.aes.keys(): + if key != 'size' and key != 'shape': + self.assertTrue(layer2.aes[key] is None) + + def test_sequence_layers(self): + layer1 = rplot.Layer(self.data) + layer2 = rplot.GeomPoint(x='SepalLength', y='SepalWidth', size=rplot.ScaleSize('PetalLength')) + layer3 = rplot.GeomPolyFit(2) + 
result = rplot.sequence_layers([layer1, layer2, layer3]) + self.assertEqual(len(result), 3) + last = result[-1] + self.assertEqual(last.aes['x'], 'SepalLength') + self.assertEqual(last.aes['y'], 'SepalWidth') + self.assertTrue(isinstance(last.aes['size'], rplot.ScaleSize)) + self.assertTrue(self.data is last.data) + self.assertTrue(rplot.sequence_layers([layer1])[0] is layer1) + +class TestTrellis(unittest.TestCase): + def setUp(self): + path = os.path.join(curpath(), 'data/tips.csv') + self.data = read_csv(path, sep=',') + layer1 = rplot.Layer(self.data) + layer2 = rplot.GeomPoint(x='total_bill', y='tip') + layer3 = rplot.GeomPolyFit(2) + self.layers = rplot.sequence_layers([layer1, layer2, layer3]) + self.trellis1 = rplot.TrellisGrid(['sex', 'smoker']) + self.trellis2 = rplot.TrellisGrid(['sex', '.']) + self.trellis3 = rplot.TrellisGrid(['.', 'smoker']) + self.trellised1 = self.trellis1.trellis(self.layers) + self.trellised2 = self.trellis2.trellis(self.layers) + self.trellised3 = self.trellis3.trellis(self.layers) + + def test_grid_sizes(self): + self.assertEqual(len(self.trellised1), 3) + self.assertEqual(len(self.trellised2), 3) + self.assertEqual(len(self.trellised3), 3) + self.assertEqual(len(self.trellised1[0]), 2) + self.assertEqual(len(self.trellised1[0][0]), 2) + self.assertEqual(len(self.trellised2[0]), 2) + self.assertEqual(len(self.trellised2[0][0]), 1) + self.assertEqual(len(self.trellised3[0]), 1) + self.assertEqual(len(self.trellised3[0][0]), 2) + self.assertEqual(len(self.trellised1[1]), 2) + self.assertEqual(len(self.trellised1[1][0]), 2) + self.assertEqual(len(self.trellised2[1]), 2) + self.assertEqual(len(self.trellised2[1][0]), 1) + self.assertEqual(len(self.trellised3[1]), 1) + self.assertEqual(len(self.trellised3[1][0]), 2) + self.assertEqual(len(self.trellised1[2]), 2) + self.assertEqual(len(self.trellised1[2][0]), 2) + self.assertEqual(len(self.trellised2[2]), 2) + self.assertEqual(len(self.trellised2[2][0]), 1) + 
self.assertEqual(len(self.trellised3[2]), 1) + self.assertEqual(len(self.trellised3[2][0]), 2) + + def test_trellis_cols_rows(self): + self.assertEqual(self.trellis1.cols, 2) + self.assertEqual(self.trellis1.rows, 2) + self.assertEqual(self.trellis2.cols, 1) + self.assertEqual(self.trellis2.rows, 2) + self.assertEqual(self.trellis3.cols, 2) + self.assertEqual(self.trellis3.rows, 1) + +class TestScaleGradient(unittest.TestCase): + def setUp(self): + path = os.path.join(curpath(), 'data/iris.csv') + self.data = read_csv(path, sep=',') + self.gradient = rplot.ScaleGradient("SepalLength", colour1=(0.2, 0.3, 0.4), colour2=(0.8, 0.7, 0.6)) + + def test_gradient(self): + for index in range(len(self.data)): + row = self.data.irow(index) + r, g, b = self.gradient(self.data, index) + r1, g1, b1 = self.gradient.colour1 + r2, g2, b2 = self.gradient.colour2 + self.assertTrue(between(r1, r2, r)) + self.assertTrue(between(g1, g2, g)) + self.assertTrue(between(b1, b2, b)) + +class TestScaleGradient2(unittest.TestCase): + def setUp(self): + path = os.path.join(curpath(), 'data/iris.csv') + self.data = read_csv(path, sep=',') + self.gradient = rplot.ScaleGradient2("SepalLength", colour1=(0.2, 0.3, 0.4), colour2=(0.8, 0.7, 0.6), colour3=(0.5, 0.5, 0.5)) + + def test_gradient2(self): + for index in range(len(self.data)): + row = self.data.irow(index) + r, g, b = self.gradient(self.data, index) + r1, g1, b1 = self.gradient.colour1 + r2, g2, b2 = self.gradient.colour2 + r3, g3, b3 = self.gradient.colour3 + value = row[self.gradient.column] + a_ = min(self.data[self.gradient.column]) + b_ = max(self.data[self.gradient.column]) + scaled = (value - a_) / (b_ - a_) + if scaled < 0.5: + self.assertTrue(between(r1, r2, r)) + self.assertTrue(between(g1, g2, g)) + self.assertTrue(between(b1, b2, b)) + else: + self.assertTrue(between(r2, r3, r)) + self.assertTrue(between(g2, g3, g)) + self.assertTrue(between(b2, b3, b)) + +class TestScaleRandomColour(unittest.TestCase): + def setUp(self): + path 
= os.path.join(curpath(), 'data/iris.csv') + self.data = read_csv(path, sep=',') + self.colour = rplot.ScaleRandomColour('SepalLength') + + def test_random_colour(self): + for index in range(len(self.data)): + colour = self.colour(self.data, index) + self.assertEqual(len(colour), 3) + r, g, b = colour + self.assertGreaterEqual(r, 0.0) + self.assertGreaterEqual(g, 0.0) + self.assertGreaterEqual(b, 0.0) + self.assertLessEqual(r, 1.0) + self.assertLessEqual(g, 1.0) + self.assertLessEqual(b, 1.0) + +class TestScaleConstant(unittest.TestCase): + def test_scale_constant(self): + scale = rplot.ScaleConstant(1.0) + self.assertEqual(scale(None, None), 1.0) + scale = rplot.ScaleConstant("test") + self.assertEqual(scale(None, None), "test") + +class TestScaleSize(unittest.TestCase): + def setUp(self): + path = os.path.join(curpath(), 'data/iris.csv') + self.data = read_csv(path, sep=',') + self.scale1 = rplot.ScaleShape('Name') + self.scale2 = rplot.ScaleShape('PetalLength') + + def test_scale_size(self): + for index in range(len(self.data)): + marker = self.scale1(self.data, index) + self.assertTrue(marker in ['o', '+', 's', '*', '^', '<', '>', 'v', '|', 'x']) + + def test_scale_overflow(self): + with self.assertRaises(ValueError): + for index in range(len(self.data)): + self.scale2(self.data, index) + +class TestRPlot(unittest.TestCase): + def test_rplot1(self): + path = os.path.join(curpath(), 'data/tips.csv') + plt.figure() + self.data = read_csv(path, sep=',') + self.plot = rplot.RPlot(self.data, x='tip', y='total_bill') + self.plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + self.plot.add(rplot.GeomPoint(colour=rplot.ScaleRandomColour('day'), shape=rplot.ScaleShape('size'))) + self.fig = plt.gcf() + self.plot.render(self.fig) + + def test_rplot2(self): + path = os.path.join(curpath(), 'data/tips.csv') + plt.figure() + self.data = read_csv(path, sep=',') + self.plot = rplot.RPlot(self.data, x='tip', y='total_bill') + self.plot.add(rplot.TrellisGrid(['.', 'smoker'])) + 
class Scale:
    """
    Base class for mapping between graphical and data attributes.
    """
    pass


class ScaleGradient(Scale):
    """
    A mapping between a data attribute value and a
    point in colour space between two specified colours.
    """
    def __init__(self, column, colour1, colour2):
        """Initialize ScaleGradient instance.

        Parameters:
        -----------
        column: string, pandas DataFrame column name
        colour1: tuple, 3 element tuple with float values representing an RGB colour
        colour2: tuple, 3 element tuple with float values representing an RGB colour
        """
        self.column = column
        self.colour1 = colour1
        self.colour2 = colour2
        self.categorical = False

    def __call__(self, data, index):
        """Return a colour corresponding to data attribute value.

        The column minimum maps to colour1, the maximum to colour2 and
        everything in between is interpolated linearly. When every value in
        the column is equal there is no range to interpolate over, so
        colour1 is returned.

        Parameters:
        -----------
        data: pandas DataFrame
        index: pandas DataFrame row index (positional)

        Returns:
        --------
        A three element tuple representing an RGB somewhere between colour1 and colour2
        """
        values = data[self.column]
        x = values.iget(index)
        # float() keeps the ratio below a true division even for integer
        # columns, matching what ScaleSize does.
        low = float(min(values))
        high = float(max(values))
        span = high - low
        # A constant column has zero span; avoid dividing by it.
        x_scaled = (x - low) / span if span else 0.0
        r1, g1, b1 = self.colour1
        r2, g2, b2 = self.colour2
        return (r1 + (r2 - r1) * x_scaled,
                g1 + (g2 - g1) * x_scaled,
                b1 + (b2 - b1) * x_scaled)
class ScaleSize(Scale):
    """
    Provide a mapping between a DataFrame column and matplotlib
    scatter plot shape size.
    """
    def __init__(self, column, min_size=5.0, max_size=100.0, transform=lambda x: x):
        """Initialize ScaleSize instance.

        Parameters:
        -----------
        column: string, a column name
        min_size: float, minimum point size
        max_size: float, maximum point size
        transform: a one argument function of form float -> float (e.g. lambda x: log(x)),
            applied to the interpolated size before it is returned
        """
        self.column = column
        self.min_size = min_size
        self.max_size = max_size
        self.transform = transform
        self.categorical = False

    def __call__(self, data, index):
        """Return matplotlib scatter plot marker shape size.

        The column minimum maps to min_size and the maximum to max_size,
        with linear interpolation in between. When every value in the
        column is equal there is no range to interpolate over, so min_size
        is used.

        Parameters:
        -----------
        data: pandas DataFrame
        index: pandas DataFrame row index (positional)

        Returns:
        --------
        transform applied to the interpolated size
        """
        values = data[self.column]
        x = values.iget(index)
        low = float(min(values))
        high = float(max(values))
        span = high - low
        # A constant column has zero span; avoid dividing by it.
        fraction = (x - low) / span if span else 0.0
        return self.transform(self.min_size +
                              fraction * (self.max_size - self.min_size))
class ScaleRandomColour(Scale):
    """
    Maps a random colour to a DataFrame attribute.
    """
    def __init__(self, column):
        """Initialize ScaleRandomColour instance.

        Parameters:
        -----------
        column: string, pandas DataFrame column name
        """
        self.column = column
        self.categorical = True

    def __call__(self, data, index):
        """Return a list of three floats, representing
        an RGB colour.

        The colour is derived from a random generator seeded with the
        attribute value, so equal values always get the same colour.

        Parameters:
        -----------
        data: pandas DataFrame
        index: pandas DataFrame row index (positional)

        Returns:
        --------
        A list of three floats, each in [0.0, 1.0)
        """
        # Use a private generator: seeding the module-level one would reset
        # the global random state for every other user of `random`.
        rng = random.Random(data[self.column].iget(index))
        return [rng.random() for _ in range(3)]
def make_aes(x=None, y=None, size=None, colour=None, shape=None, alpha=None):
    """Build an aesthetics dictionary from the given bindings.

    A binding that is neither None nor callable is wrapped in a
    ScaleConstant. Each binding must then be None or an instance of one of
    the scale classes accepted for that aesthetic.

    Parameters:
    -----------
    x: string, DataFrame column name
    y: string, DataFrame column name
    size: function, binding for size attribute of Geoms
    colour: function, binding for colour attribute of Geoms
    shape: function, binding for shape attribute of Geoms
    alpha: function, binding for alpha attribute of Geoms

    Returns:
    --------
    a dictionary with aesthetics bindings

    Raises:
    -------
    ValueError if a binding is not one of the scales accepted for it
    """
    def bind(value, accepted, message):
        # None means "not set here" and is passed through untouched.
        if value is None:
            return None
        # Plain values become constant scales.
        if not hasattr(value, '__call__'):
            value = ScaleConstant(value)
        if not isinstance(value, accepted):
            raise ValueError(message)
        return value

    size = bind(size, (ScaleConstant, ScaleSize),
                'size mapping should be done through ScaleConstant or ScaleSize')
    colour = bind(colour,
                  (ScaleConstant, ScaleGradient, ScaleGradient2, ScaleRandomColour),
                  'colour mapping should be done through ScaleConstant, ScaleRandomColour, ScaleGradient or ScaleGradient2')
    shape = bind(shape, (ScaleConstant, ScaleShape),
                 'shape mapping should be done through ScaleConstant or ScaleShape')
    alpha = bind(alpha, (ScaleConstant,),
                 'alpha mapping should be done through ScaleConstant')
    return {
        'x' : x,
        'y' : y,
        'size' : size,
        'colour' : colour,
        'shape' : shape,
        'alpha' : alpha,
    }
class GeomPolyFit(Layer):
    """
    Layer that draws a least-squares polynomial fit of the layer's
    x and y columns.
    """
    def __init__(self, degree, lw=2.0, colour='grey'):
        """Initialize GeomPolyFit object.

        Parameters:
        -----------
        degree: an integer, polynomial degree
        lw: line width
        colour: matplotlib colour
        """
        # The base initialiser only sets data, aes and legend, so running
        # it first does not disturb the attributes assigned below.
        Layer.__init__(self)
        self.degree = degree
        self.lw = lw
        self.colour = colour

    def work(self, fig=None, ax=None):
        """Draw the polynomial fit on matplotlib figure or axis

        Parameters:
        -----------
        fig: matplotlib figure
        ax: matplotlib axis

        Returns:
        --------
        a tuple with figure and axis objects
        """
        if ax is None and fig is None:
            # Nothing to draw on.
            return fig, ax
        if ax is None:
            ax = fig.gca()
        from numpy.polynomial.polynomial import polyfit
        from numpy.polynomial.polynomial import polyval
        xs = self.data[self.aes['x']]
        ys = self.data[self.aes['y']]
        coefficients = polyfit(xs, ys, self.degree)
        # Evaluate the fit on as many evenly spaced points as there are rows.
        grid = np.linspace(min(xs), max(xs), len(xs))
        ax.plot(grid, polyval(grid, coefficients), lw=self.lw, c=self.colour)
        return fig, ax
class GeomHistogram(Layer):
    """
    Layer that draws a histogram of the layer's x column.
    """
    def __init__(self, bins=10, colour='lightblue'):
        """Initialize GeomHistogram instance.

        Parameters:
        -----------
        bins: integer, number of histogram bins
        colour: matplotlib colour
        """
        # The base initialiser only sets data, aes and legend, so running
        # it first does not disturb the attributes assigned below.
        Layer.__init__(self)
        self.bins = bins
        self.colour = colour

    def work(self, fig=None, ax=None):
        """Draw a histogram on matplotlib figure or axis

        Parameters:
        -----------
        fig: matplotlib figure
        ax: matplotlib axis

        Returns:
        --------
        a tuple with figure and axis objects
        """
        if ax is None and fig is None:
            # Nothing to draw on.
            return fig, ax
        if ax is None:
            ax = fig.gca()
        column = self.aes['x']
        ax.hist(self.data[column], self.bins, facecolor=self.colour)
        ax.set_xlabel(column)
        return fig, ax
class GeomDensity2D(Layer):
    """
    A two dimensional kernel density estimation contour plot.
    """
    def work(self, fig=None, ax=None):
        """Draw a two dimensional kernel density plot.
        You can specify either a figure or an axis to draw on.

        Parameters:
        -----------
        fig: matplotlib figure object
        ax: matplotlib axis object to draw on

        Returns:
        --------
        fig, ax: matplotlib figure and axis objects
        """
        if ax is None:
            if fig is None:
                return fig, ax
            else:
                ax = fig.gca()
        x = self.data[self.aes['x']]
        y = self.data[self.aes['y']]
        x_min = x.min()
        x_max = x.max()
        y_min = y.min()
        y_max = y.max()
        # 200 x 200 evaluation grid; the first axis of X and Y runs along x.
        X, Y = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
        positions = np.vstack([X.ravel(), Y.ravel()])
        values = np.vstack([x, y])
        kernel = stats.gaussian_kde(values)
        Z = np.reshape(kernel(positions).T, X.shape)
        # Z is indexed [x, y], but contour treats rows as y and columns as
        # x, so transpose to keep the density aligned with the data axes.
        ax.contour(Z.T, extent=[x_min, x_max, y_min, y_max])
        return fig, ax
def dictionary_union(dict1, dict2):
    """Take two dictionaries, return dictionary union.

    Neither input is modified.

    Parameters:
    -----------
    dict1: Python dictionary
    dict2: Python dictionary

    Returns:
    --------
    A new dictionary holding the union of the two. Values with the same
    keys are assumed to be identical; if they differ, the value from
    dict2 is kept.
    """
    result = dict(dict1)
    result.update(dict2)
    return result
def _legend_entries(legend):
    """Turn a merged layer legend into parallel (patches, labels) lists.

    Parameters:
    -----------
    legend: dictionary mapping a key tuple of the form (column, value) or
        (column1, value1, column2, value2) to a matplotlib artist

    Returns:
    --------
    patches, labels: two lists of equal length, sorted by attribute values

    Raises:
    -------
    ValueError if a key describes more than two categorical attributes
    """
    patches = []
    labels = []
    # An empty key comes from a layer without categorical scales; there is
    # nothing to label for it, so it is left out of the legend.
    keys = [key for key in legend if len(key) > 0]
    # Sort on the values only (odd positions), not on the column names.
    for key in sorted(keys, key=lambda tup: tup[1::2]):
        if len(key) == 2:
            label = "%s" % str(key[1])
        elif len(key) == 4:
            label = "%s, %s" % (str(key[1]), str(key[3]))
        else:
            raise ValueError("Maximum 2 categorical attributes to display a lengend of")
        patches.append(legend[key])
        labels.append(label)
    return patches, labels


def adjust_subplots(fig, axes, trellis, layers):
    """Adjust the subplots on matplotlib figure with the
    fact that we have a trellis plot in mind.

    All subplots get the same axis limits, ticks and axis labels are kept
    only on the first column and the last row, every subplot gets a header
    naming its group, and a figure legend is drawn when any layer has
    categorical legend entries.

    Parameters:
    -----------
    fig: matplotlib figure
    axes: a two dimensional grid of matplotlib axes
    trellis: TrellisGrid object
    layers: last grid of layers in the plot
    """
    # Flatten the axes grid
    axes = [ax for row in axes for ax in row]
    min_x = min([ax.get_xlim()[0] for ax in axes])
    max_x = max([ax.get_xlim()[1] for ax in axes])
    min_y = min([ax.get_ylim()[0] for ax in axes])
    max_y = max([ax.get_ylim()[1] for ax in axes])
    for ax in axes:
        ax.set_xlim(min_x, max_x)
        ax.set_ylim(min_y, max_y)
    for index, axis in enumerate(axes):
        # divmod keeps the grid position integral under true division too.
        row, col = divmod(index, trellis.cols)
        if col != 0:
            axis.get_yaxis().set_ticks([])
            axis.set_ylabel('')
        if row != trellis.rows - 1:
            axis.get_xaxis().set_ticks([])
            axis.set_xlabel('')
        group = trellis.group_grid[row][col]
        if trellis.by[0] == '.':
            label1 = "%s = %s" % (trellis.by[1], group)
            label2 = None
        elif trellis.by[1] == '.':
            label1 = "%s = %s" % (trellis.by[0], group)
            label2 = None
        else:
            label1 = "%s = %s" % (trellis.by[0], group[0])
            label2 = "%s = %s" % (trellis.by[1], group[1])
        if label2 is not None:
            axis.table(cellText=[[label1], [label2]],
                       loc='top', cellLoc='center',
                       cellColours=[['lightgrey'], ['lightgrey']])
        else:
            axis.table(cellText=[[label1]], loc='top', cellLoc='center', cellColours=[['lightgrey']])
    # Flatten the layer grid and merge the per-layer legends
    legend = {}
    for layer in [layer for row in layers for layer in row]:
        legend.update(layer.legend)
    patches, labels = _legend_entries(legend)
    if patches:
        fig.legend(patches, labels, loc='upper right')
    fig.subplots_adjust(wspace=0.05, hspace=0.2)


class RPlot:
    """
    The main plot object. Add layers to an instance of this object to create a plot.
    """
    def __init__(self, data, x=None, y=None):
        """Initialize RPlot instance.

        Parameters:
        -----------
        data: pandas DataFrame instance
        x: string, DataFrame column name
        y: string, DataFrame column name
        """
        self.layers = [Layer(data, **default_aes(x=x, y=y))]
        # Stored on the instance; previously this was a discarded local.
        self.trellised = False

    def add(self, layer):
        """Add a layer to RPlot instance.

        Parameters:
        -----------
        layer: Layer instance

        Raises:
        -------
        TypeError if layer is not a Layer instance
        """
        if not isinstance(layer, Layer):
            raise TypeError("The operand on the right side of + must be a Layer instance")
        self.layers.append(layer)

    def render(self, fig=None):
        """Render all the layers on a matplotlib figure.

        Parameters:
        -----------
        fig: matplotlib figure, the current figure is used when None

        Returns:
        --------
        the figure that was drawn on
        """
        if fig is None:
            fig = plt.gcf()
        # Look for the last TrellisGrid instance in the layer list
        last_trellis = None
        for layer in self.layers:
            if isinstance(layer, TrellisGrid):
                last_trellis = layer
        if last_trellis is None:
            # We have a simple, non-trellised plot
            new_layers = sequence_layers(self.layers)
            for layer in new_layers:
                layer.work(fig=fig)
            legend = {}
            for layer in new_layers:
                legend.update(layer.legend)
            patches, labels = _legend_entries(legend)
            if patches:
                fig.legend(patches, labels, loc='upper right')
        else:
            # We have a trellised plot.
            # First let's remove all TrellisGrid instances from the layer list,
            # including the one we are going to use.
            new_layers = [layer for layer in self.layers
                          if not isinstance(layer, TrellisGrid)]
            new_layers = sequence_layers(new_layers)
            # Now replace the old layers by their trellised versions
            new_layers = last_trellis.trellis(new_layers)
            # Prepare the subplots and draw on them
            new_layers = sequence_grids(new_layers)
            axes_grids = [work_grid(grid, fig) for grid in new_layers]
            axes_grid = axes_grids[-1]
            adjust_subplots(fig, axes_grid, last_trellis, new_layers[-1])
        # And we're done
        return fig