Introduction

This post will demonstrate data ingestion and performing aggregate summaries using SQL & R. Specifically, we will connect to a remote SQL server using ODBC package. This package is currently not available in CRAN but can be downloaded from the here.

Then using the very powerful data.table package in R, we will do some relevant summaries.

ODBC Demo

The below code will load the DBI library. This library is installed when ODBC package is installed.

#devtools::install_github("rstats-db/odbc")
library(DBI)
#connectionString <- "Driver=SQL Server;Server={Servern name here};Database={Database name here};Uid={User Id here};Pwd={Password here};Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;"

#Set up a channel to the SQL server Database
con <- dbConnect(odbc::odbc(),
  .connection_string = connectionString)

We can check the connection by checking the tables unders a specific schema.

dbListTables(con, schema = "bi" )
## [1] "date"         "geo"          "manufacturer" "product"     
## [5] "salesFact"    "sentiment"

R Notebooks also allow to run SQL chunks directly.

SELECT COUNT(*) FROM bi.geo
1 records
99618

Just another example!

        SELECT TOP 10 sf.ProductID, SUM(sf.Units) as Total, p.Product, p.Category, p.Segment
        FROM bi.salesFact as sf
        INNER JOIN bi.product as p
        ON p.ProductID = sf.ProductID
        GROUP BY sf.ProductID, p.Product, p.Category, p.Segment
Displaying records 1 - 10
ProductID Total Product Category Segment
1 488 Abbas MA-01 Mix All Season
3 8515 Abbas MA-03 Mix All Season
4 1912 Abbas MA-04 Mix All Season
5 18 Abbas MA-05 Mix All Season
7 362 Abbas MA-07 Mix All Season
8 6459 Abbas MA-08 Mix All Season
9 5452 Abbas MA-09 Mix All Season
10 161 Abbas MA-10 Mix All Season
11 17 Abbas MA-11 Mix All Season
12 272 Abbas MA-12 Mix All Season

Creating data frame using odbc connection

qry <- "SELECT sf.ProductID, SUM(sf.Units) as Total, p.Product, p.Category, p.Segment
        FROM bi.salesFact as sf
        INNER JOIN bi.product as p
        ON p.ProductID = sf.ProductID
        GROUP BY sf.ProductID, p.Product, p.Category, p.Segment"

smry <- dbGetQuery(con, qry)

head(smry)
##   ProductID Total     Product Category    Segment
## 1         1   488 Abbas MA-01      Mix All Season
## 2         3  8515 Abbas MA-03      Mix All Season
## 3         4  1912 Abbas MA-04      Mix All Season
## 4         5    18 Abbas MA-05      Mix All Season
## 5         7   362 Abbas MA-07      Mix All Season
## 6         8  6459 Abbas MA-08      Mix All Season
#class(smry)

Below query will display the top selling product for each category

library(sqldf)

qry0 <- "SELECT Category, Product, Total FROM smry AS X 
              WHERE Total >= ALL(SELECT Total FROM smry AS Y WHERE X.Category = Y.Category)"

library(sqldf)
# ALL Keyword not supported by sqldf
#sqldf(qry0)

Same as above

qry1 <- "SELECT Category, Product, Total FROM smry AS X 
              WHERE Total >= (SELECT MAX(Total) FROM smry AS Y WHERE X.Category = Y.Category)"

sqldf(qry1)
##   Category       Product  Total
## 1    Urban Maximus UC-69 120822
## 2    Rural  Natura RP-79 113849
## 3    Rural  Natura RP-80 113849
## 4      Mix   Pirum MA-11  53978
## 5    Youth  Aliqui YY-05  52684

This query will only show category and product total for max product

# This will not show the max product name or category
qry1.5 <- "SELECT Category, max(Total) 
              FROM smry
                GROUP BY Category"

sqldf(qry1.5)
##   Category max(Total)
## 1      Mix      53978
## 2    Rural     113849
## 3    Urban     120822
## 4    Youth      52684

Select Top N products from each category

# Top keywork not recognized in sqldf package
qry3 <- "SELECT Category, Product, Total FROM smry AS X
              WHERE Total IN (SELECT TOP 1 Total FROM smry AS Y
               WHERE X.Category = Y.Category
               ORDER BY Total)"

#sqldf(qry3)

Data Table Demo

Loading the Package and Data Table

library(data.table)
smry <- data.table(smry)

Category wise aggregation of total Units

smry[, .(GrandTotal = sum(Total)), by = Category][order(-GrandTotal)]
##    Category GrandTotal
## 1:    Urban    5961949
## 2:    Rural    3968287
## 3:    Youth     522154
## 4:      Mix     367471

Products by Category

How many products are present is each Category

smry[, .(Products=.N), by = Category][order(-Products)]
##    Category Products
## 1:    Urban     1153
## 2:    Rural      625
## 3:      Mix      157
## 4:    Youth      136

Products by Category and Segment

How many products are present in each Category

smry[, .(Products=.N), by = .(Category, Segment)][order(Category, -Products)]
##    Category      Segment Products
## 1:      Mix   All Season      125
## 2:      Mix Productivity       32
## 3:    Rural Productivity      502
## 4:    Rural       Select      123
## 5:    Urban  Convenience      382
## 6:    Urban      Extreme      317
## 7:    Urban   Moderation      265
## 8:    Urban      Regular      189
## 9:    Youth        Youth      136

Pick up top N Selling Product in Each Category

#smry

N <- 2

smry[order(Category,-Total)][,head(.SD, N), keyby = .(Category)]
##    Category ProductID  Total       Product      Segment
## 1:      Mix      1053  53978   Pirum MA-11   All Season
## 2:      Mix      1916  26257  Currus MA-09   All Season
## 3:    Rural       791 113849  Natura RP-79 Productivity
## 4:    Rural       792 113849  Natura RP-80 Productivity
## 5:    Urban       604 120822 Maximus UC-69  Convenience
## 6:    Urban      2332 107074  Aliqui UE-06      Extreme
## 7:    Youth      2396  52684  Aliqui YY-05        Youth
## 8:    Youth      1000  48719  Natura YY-01        Youth

Mean Sales for each Category

smry[ ,lapply(.SD, mean), by = Category, .SDcols = "Total"]
##    Category    Total
## 1:      Mix 2340.580
## 2:    Rural 6349.259
## 3:    Urban 5170.814
## 4:    Youth 3839.368

Number of Products per Category with Product ID > N

N <-1000
smry[, .N, by = .(Category,ProductID > N)][order(Category,ProductID > N)]
##    Category ProductID   N
## 1:      Mix     FALSE  48
## 2:      Mix      TRUE 109
## 3:    Rural     FALSE 128
## 4:    Rural      TRUE 497
## 5:    Urban     FALSE 683
## 6:    Urban      TRUE 470
## 7:    Youth     FALSE   3
## 8:    Youth      TRUE 133

% Contribution of each Category to total Sales

smry[,sum(Total)/(smry[,sum(Total)]), by = Category]
##    Category         V1
## 1:      Mix 0.03396264
## 2:    Rural 0.36675952
## 3:    Urban 0.55101900
## 4:    Youth 0.04825885

% Contribution of each Segment in Category

#smry1 <- smry[ ,sum(Total), keyby = .(Category, Segment)]
#smry1
#smry1[,V1/sum(V1), keyby = .(Category)]

smry[ ,.(T1 = sum(Total)), keyby = .(Category, Segment)][,.(Segment, P1 = T1/sum(T1)), keyby = .(Category)]
##    Category      Segment         P1
## 1:      Mix   All Season 0.95479915
## 2:      Mix Productivity 0.04520085
## 3:    Rural Productivity 0.86927886
## 4:    Rural       Select 0.13072114
## 5:    Urban  Convenience 0.46002440
## 6:    Urban      Extreme 0.23975951
## 7:    Urban   Moderation 0.26709370
## 8:    Urban      Regular 0.03312239
## 9:    Youth        Youth 1.00000000

% Contribution of each Segment in Category

qry7 <- "SELECT s.Category, s.Segment, s.T1*100/p.T2 AS P1
        FROM (
                SELECT Category, Segment, sum(Total) AS T1
                FROM smry
                GROUP BY Category, Segment) AS s
        JOIN (SELECT Category, SUM(Total) AS T2
        FROM smry
        GROUP BY Category) AS p
        ON p.Category = s.Category
        GROUP BY s.Category, s.Segment"

sqldf(qry7)
##   Category      Segment  P1
## 1      Mix   All Season  95
## 2      Mix Productivity   4
## 3    Rural Productivity  86
## 4    Rural       Select  13
## 5    Urban  Convenience  46
## 6    Urban      Extreme  23
## 7    Urban   Moderation  26
## 8    Urban      Regular   3
## 9    Youth        Youth 100

Selecting Top Nth Product in Each Category

smry[order(Category, -Total)][ , .SD[1], by = Category]
##    Category ProductID  Total       Product      Segment
## 1:      Mix      1053  53978   Pirum MA-11   All Season
## 2:    Rural       791 113849  Natura RP-79 Productivity
## 3:    Urban       604 120822 Maximus UC-69  Convenience
## 4:    Youth      2396  52684  Aliqui YY-05        Youth

Selecting Top Nth Product in Each Category using Ranking

qry8 <- "SELECT Category, Segment, Product, Total, 
                                                (
                                                SELECT COUNT(*)+1
                                                FROM smry as s2
                                                WHERE s2.Category = s1.Category AND s2.Total > s1.Total
                                                ) AS Rnk
        FROM smry AS s1
        WHERE Rnk = 1
        GROUP BY Category, Segment, Product
        ORDER BY Category, Rnk"

sqldf(qry8)
##   Category      Segment       Product  Total Rnk
## 1      Mix   All Season   Pirum MA-11  53978   1
## 2    Rural Productivity  Natura RP-79 113849   1
## 3    Rural Productivity  Natura RP-80 113849   1
## 4    Urban  Convenience Maximus UC-69 120822   1
## 5    Youth        Youth  Aliqui YY-05  52684   1

Data Table reference semantics

smry1 <- smry

smry1[, rank := as.numeric(frank(-Total , ties.method = "min")), by = Category][order(Category, rank)][rank == 1]
##    ProductID  Total       Product Category      Segment rank
## 1:      1053  53978   Pirum MA-11      Mix   All Season    1
## 2:       791 113849  Natura RP-79    Rural Productivity    1
## 3:       792 113849  Natura RP-80    Rural Productivity    1
## 4:       604 120822 Maximus UC-69    Urban  Convenience    1
## 5:      2396  52684  Aliqui YY-05    Youth        Youth    1

Setting Keys

setkey(smry1, Category, Segment)

smry1[.("Mix", "All Season"), sum(Total), by = .(Category, Segment)]
##    Category    Segment     V1
## 1:      Mix All Season 350861

WITH keyword supported in sqldf. % contribution of each Segment in Category

#head(smry)
smry <- smry1[,-"rank"]

qry10 <- "WITH s AS (
                SELECT Category, sum(Total) as Tot
                FROM smry
                GROUP BY Category)
SELECT s1.Category, s1.Segment, sum(s1.Total)*100/s.Tot
FROM smry AS s1
JOIN s
ON s.Category = s1.Category
GROUP BY s1.Category, s1.Segment"

sqldf(qry10)
##   Category      Segment sum(s1.Total)*100/s.Tot
## 1      Mix   All Season                      95
## 2      Mix Productivity                       4
## 3    Rural Productivity                      86
## 4    Rural       Select                      13
## 5    Urban  Convenience                      46
## 6    Urban      Extreme                      23
## 7    Urban   Moderation                      26
## 8    Urban      Regular                       3
## 9    Youth        Youth                     100

Pivoting

dcast(smry, Category ~ Segment, value.var = "Total")
## Aggregate function missing, defaulting to 'length'
##    Category All Season Convenience Extreme Moderation Productivity Regular
## 1:      Mix        125           0       0          0           32       0
## 2:    Rural          0           0       0          0          502       0
## 3:    Urban          0         382     317        265            0     189
## 4:    Youth          0           0       0          0            0       0
##    Select Youth
## 1:      0     0
## 2:    123     0
## 3:      0     0
## 4:      0   136
temp <- dcast(smry, Category ~ Segment, fun.aggregate = function(x) sum(x), value.var = "Total")
temp
##    Category All Season Convenience Extreme Moderation Productivity Regular
## 1:      Mix     350861           0       0          0        16610       0
## 2:    Rural          0           0       0          0      3449548       0
## 3:    Urban          0     2742642 1429434    1592399            0  197474
## 4:    Youth          0           0       0          0            0       0
##    Select  Youth
## 1:      0      0
## 2: 518739      0
## 3:      0      0
## 4:      0 522154

PIVOT Keyword not supported in sqldf

qry11 <- "SELECT * 
        FROM smry 
        PIVOT 
        (
                SUM(Total)
                FOR Segment
                IN ([Extreme], [Regular])
        ) pt"

# sqldf(qry11)


SELECT *
        FROM (
                SELECT SUM(sf.Units) as Total, p.Category, p.Segment
                FROM bi.salesFact as sf
                INNER JOIN bi.product as p
                ON p.ProductID = sf.ProductID
                GROUP BY sf.ProductID, p.Product, p.Category, p.Segment
                ) AS smry
                PIVOT 
                (
                SUM(Total)
                FOR Segment
                IN ([Productivity])
                ) pt
4 records
Category Productivity
Mix 16610
Rural 3449548
Urban NA
Youth NA