SlideShare a Scribd company logo
R meets Hadoop
R meets Hadoop
R meets Hadoop
R meets Hadoop
R meets Hadoop
6
7
8
Blocks (Input Data)




 Parallel Partition         Node 1   Node 2   Node 3
      (MAP)


         Network Transfer



Parallel Recombine          Node 1   Node 2   Node 3
    (REDUCE)




   Output Data
                                                       9
10
11
12
13
14
•




R CMD INSTALL Rhipe_version.tar.gz




                                     15
# Skeleton of a RHIPE MapReduce job: fill in the map and reduce bodies,
# build the job with rhmr(), run it with rhex(), read the output with rhread().

# Map step: the code to run goes where the placeholder comment is.
map <- expression({
    #
})

# Reduce step: an expression with three named parts (pre, reduce, post),
# all left empty in this template.
reduce <- expression(
   pre      = {},
   reduce   = {},
   post     = {}
)

# Describe the job (inout is c("text","sequence"); input and output folders
# are given by ifolder / ofolder), then run it.
# Path strings use plain ASCII double quotes: typographic quotes are not
# string delimiters in R and make the call unparseable.
z <- rhmr(map=map, reduce=reduce, inout=c("text","sequence")
         ,ifolder="/tmp/input", ofolder="/tmp/output")
rhex(z)

# Read what the job wrote to the output folder.
results <- rhread("/tmp/output")

                                                               16
map <- expression({
   # Word-count mapper: tokenize all input values at once, tabulate the tokens,
   # and emit one (token, count) pair per distinct token via rhcollect().
   library(openNLP)
   tokens <- tokenize(unlist(map.values), language = "en")
   freq <- table(tokens)
   words <- names(freq)
   counts <- as.numeric(freq)
   sapply(seq_along(words), function(i) rhcollect(words[i], counts[i]))
})




# Word-count reducer: sums every count received for one key.
reduce <- expression(
    # pre: start the running total at zero.
    pre = {
        total <- 0
    },
    # reduce: add the sum of the current batch of values to the total.
    reduce = {
        batch <- unlist(reduce.values)
        total <- total + sum(batch)
    },
    # post: emit the key together with its accumulated total.
    post = {
        rhcollect(reduce.key, total)
    }
)




# Build the word-count job from the map and reduce expressions and run it.
# The folder paths are quoted with plain ASCII double quotes; typographic
# quotes are not valid string delimiters in R.
z <- rhmr(map=map, reduce=reduce, inout=c("text","sequence")
         ,ifolder="/tmp/input", ofolder="/tmp/output")
rhex(z)



                                                               17
>  results <- rhread("/tmp/output")
>  results <- data.frame(word=unlist(lapply(results,"[[",1))
+                       ,count =unlist (lapply(results,"[[",2)))
>  results <- (results[order(results$count, decreasing=TRUE), ])
>  head(results)
     word count
13      . 2080
439 the 1101
11      ,   760
32      a   701
153    to   658
28      I   651
> results[results["word"] == "FACEBOOK", ]
          word count
3221 FACEBOOK      6
> results[results["word"] == "Facebook", ]
          word count
3223 Facebook     39
> results[results["word"] == "facebook", ]
          word count
3389 facebook      6



                                                                   18
19
map <- expression({
  # Download one URL into ./tmp with wget, sending wget's stderr to ./errors.
  # Stops with the captured stderr text when wget exits with a non-zero status
  # or its output mentions "failed" / "unable".
  msys <- function(on){
    status <- system(sprintf("wget %s --directory-prefix ./tmp 2> ./errors",on))
    err_lines <- readLines("./errors")
    if(status != 0 || length(grep("(failed)|(unable)",err_lines))>0){
       # Join the log lines with real newlines so the error stays readable.
       stop(paste(err_lines,collapse="\n"))
    }}

  # Each map value is an offset added to 1986 to get the year to fetch
  # (presumably 1..N task indices -- confirm against the job definition).
  lapply(map.values,function(x){
    x=1986+x
    on <- sprintf("http://stat-computing.org/dataexpo/2009/%s.csv.bz2",x)
    fn <- sprintf("./tmp/%s.csv.bz2",x)
    rhstatus(sprintf("Downloading %s", on))
    msys(on)
    rhstatus(sprintf("Downloaded %s", on))
    # Decompress the downloaded archive with bunzip2.
    system(sprintf('bunzip2 %s',fn))
    rhstatus(sprintf("Unzipped %s", on))
    # Count one file under the year's name and one under "_ALL_".
    rhcounter("FILES",x,1)
    rhcounter("FILES","_ALL_",1)
  })
})
# Download job: there is no reduce expression and mapred.reduce.tasks is 0;
# N is the number of years in 1987:2008. The job is started asynchronously
# and the value returned by rhex() is kept in j.
z <- rhmr(
  map = map,
  ofolder = "/airline/data",
  inout = c("lapply"),
  N = length(1987:2008),
  mapred = list(mapred.reduce.tasks = 0, mapred.task.timeout = 0),
  copyFiles = TRUE
)
j <- rhex(z, async = TRUE)


                                                                                20
setup <- expression({
  # convertHHMM: split clock strings such as "1230" or "730" into hour and
  # minute parts.
  #   s       character vector of time strings
  #   returns a character matrix with one row per element of s and two
  #           columns (hour, minute); 4-character strings split 2+2,
  #           3-character strings split 1+2, anything else becomes "0","0".
  # vapply() fixes the per-element result to character(2), so empty input
  # yields a 0 x 2 matrix instead of failing in t().
  convertHHMM <- function(s){
     t(vapply(s,function(r){
        l=nchar(r)
        # isTRUE() sends a missing length to the "0","0" fallback instead of
        # raising an error in if().
        if(isTRUE(l==4)) c(substr(r,1,2),substr(r,3,4))
        else if(isTRUE(l==3)) c(substr(r,1,1),substr(r,2,3))
        else c('0','0')
     }, character(2))
  )}
})
map <- expression({
  # Split every input line into comma-separated fields, skipping lines that
  # start with 'Year' (the CSV header), and stack the rows into a matrix.
  parsed <- lapply(map.values,function(line){
     if(substr(line,1,4)!='Year') strsplit(line,",")[[1]]
  })
  y <- do.call("rbind",parsed)
  yr <- y[,1]; mn <- y[,2]; dy <- y[,3]
  # The seconds field is fixed at 1 for every row.
  mu <- rep(1,nrow(y))
  # Turn the HHMM strings of one column of y into date-times for the row's
  # year/month/day, using convertHHMM() for the hour/minute split.
  stamp <- function(col){
     hm <- convertHHMM(y[,col])
     ISOdatetime(year=yr,month=mn,day=dy,hour=hm[,1],min=hm[,2],sec=mu)
  }
  d <- data.frame(depart = stamp(5), sdepart = stamp(6),
                  arrive = stamp(7), sarrive = stamp(8),
                  carrier = y[,9], origin = y[,17], dest = y[,18],
                  dist = y[,19], year = yr, month = mn, day = dy,
                  cancelled = y[,22], stringsAsFactors=FALSE)
  # Sort by sdepart and emit the whole block, keyed by the first and last
  # sdepart values.
  d <- d[order(d$sdepart),]
  rhcollect(d[c(1,nrow(d)),"sdepart"],d)
})
# Pass-through reducer: every value received for a key is emitted again
# unchanged under that same key.
reduce <- expression(
     reduce = {
        lapply(reduce.values, function(block) rhcollect(reduce.key, block))
     }
     )
# Job that reads the text files under /airline/data/ and writes the results
# to /airline/blocks; the setup expression is passed alongside map and reduce.
# NOTE(review): `mapred` is not defined in the visible code -- it must exist
# before this call.
z <- rhmr(map=map,reduce=reduce,setup=setup,inout=c("text","sequence")
            ,ifolder="/airline/data/",ofolder="/airline/blocks",mapred=mapred,orderby="numeric")
rhex(z)
map <- expression({
   # Stack the incoming blocks into one table of flights.
   a <- do.call("rbind",map.values)
   # Per-airport tallies: `inbound` counts rows by 'origin', `outbound` counts
   # rows by 'dest', `total` counts appearances in either column.
   # NOTE(review): the inbound/outbound names look swapped relative to the
   # columns they count -- left as is because downstream code reads the
   # emitted positions.
   inbound <- table(a[,'origin'])
   outbound <- table(a[,'dest'])
   # Both columns are extracted as vectors with a[, col]; the earlier
   # a['dest'] form only worked through list coercion inside c()/unlist().
   total <- table(c(a[,'origin'],a[,'dest']))
   for (n in names(total)) {
     # An airport missing from one table indexes to NA; count it as 0.
     inb <- if(is.na(inbound[n])) 0 else inbound[n]
     ob <- if(is.na(outbound[n])) 0 else outbound[n]
     rhcollect(n, c(inb,ob, total[n]))
   }
})


# Reducer that adds up the three-element count vectors received for one key.
reduce <- expression(
    # pre: start from three zeros.
    pre = {
        sums <- numeric(3)
    },
    # reduce: stack the current batch row-wise and add its column sums.
    reduce = {
        stacked <- do.call("rbind", reduce.values)
        sums <- sums + apply(stacked, 2, sum)
    },
    # post: emit the key with its accumulated sums.
    post = {
        rhcollect(reduce.key, sums)
    }
    )


# Volume job: reads the blocks under /airline/blocks/, writes to
# /airline/volume, passes combiner=TRUE, and is started asynchronously.
z <- rhmr(
  map = map,
  reduce = reduce,
  combiner = TRUE,
  inout = c("sequence", "sequence"),
  ifolder = "/airline/blocks/",
  ofolder = "/airline/volume"
)
rhex(z, async = TRUE)


                                                                               22
>  counts <- rhread("/airline/volume")
>  aircode <- unlist(lapply(counts, "[[",1))
>  count <- do.call("rbind",lapply(counts,"[[",2))
>  results <- data.frame(aircode=aircode,
+                         inb=count[,1],oub=count[,2],all=count[,3]
+                         ,stringsAsFactors=FALSE)
>  results <- results[order(results$all,decreasing=TRUE),]
>  ap <- read.table("~/tmp/airports.csv",sep=",",header=TRUE,
+                    stringsAsFactors=FALSE,na.strings="XYZB")
>  results$airport <- sapply(results$aircode,function(r){
+      nam <- ap[ap$iata==r,'airport']
+      if(length(nam)==0) r else nam
+  })
>  results[1:10,]
     aircode      inb     oub      all                           airport
243      ORD 6597442 6638035 13235477       Chicago O'Hare International
21       ATL 6100953 6094186 12195139 William B Hartsfield-Atlanta Intl
91       DFW 5710980 5745593 11456573    Dallas-Fort Worth International
182      LAX 4089012 4086930 8175942           Los Angeles International
254      PHX 3491077 3497764 6988841 Phoenix Sky Harbor International
89       DEN 3319905 3335222 6655127                         Denver Intl
97       DTW 2979158 2997138 5976296 Detroit Metropolitan-Wayne County
156      IAH 2884518 2889971 5774489        George Bush Intercontinental
230      MSP 2754997 2765191 5520188            Minneapolis-St Paul Intl
300      SFO 2733910 2725676 5459586         San Francisco International
                                                                           23
R meets Hadoop
map <- expression({
   # Stack the incoming blocks into one table of flights.
   a <- do.call("rbind",map.values)
   # Build a direction-independent key for each flight by sorting its
   # origin/dest pair and joining with a comma, then count flights per key.
   y <- table(apply(a[,c("origin","dest")],1,function(r){
     paste(sort(r),collapse=",")
   }))
   # seq_along() keeps the loop from running when y is empty.
   for(i in seq_along(y)){
     p <- strsplit(names(y)[[i]],",")[[1]]
     # Emit this pair's own count (index i, not the first pair's count).
     rhcollect(p,y[[i]])
   }
})


# Reducer that totals all counts received for one key.
reduce <- expression(
    # pre: start the running sum at zero.
    pre = {
        sums <- 0
    },
    # reduce: add the sum of the current batch of values.
    reduce = {
        batch <- unlist(reduce.values)
        sums <- sums + sum(batch)
    },
    # post: emit the key with its accumulated sum.
    post = {
        rhcollect(reduce.key, sums)
    }
)


# Pair-count job: reads the blocks under /airline/blocks/ and writes to
# /airline/ijjoin with combiner=TRUE; z is then overwritten with the value
# returned by rhex().
z <- rhmr(
  map = map,
  reduce = reduce,
  combiner = TRUE,
  inout = c("sequence", "sequence"),
  ifolder = "/airline/blocks/",
  ofolder = "/airline/ijjoin"
)
z <- rhex(z)




                                                                               25
> b=rhread("/airline/ijjoin")
> y <- do.call("rbind",lapply(b,"[[",1))
> results <- data.frame(a=y[,1],b=y[,2],count=
+            do.call("rbind",lapply(b,"[[",2)),stringsAsFactors=FALSE)
> results <- results[order(results$count,decreasing=TRUE),]
> results$cumprop <- cumsum(results$count)/sum(results$count)
> a.lat <- t(sapply(results$a,function(r){
+   ap[ap$iata==r,c('lat','long')]
+ }))
> results$a.lat <- unlist(a.lat[,'lat'])
> results$a.long <- unlist(a.lat[,'long'])
> b.lat <- t(sapply(results$b,function(r){
+   ap[ap$iata==r,c('lat','long')]
+ }))
> b.lat["CBM",] <- c(0,0)
> results$b.lat <- unlist(b.lat[,'lat'])
> results$b.long <- unlist(b.lat[,'long'])
> head(results)
       a   b count      cumprop    a.lat     a.long    b.lat     b.long
418 ATL ORD 141465 0.001546158 33.64044 -84.42694 41.97960 -87.90446
2079 DEN DFW 138892 0.003064195 39.85841 -104.66700 32.89595 -97.03720
331 ATL DFW 135357 0.004543595 33.64044 -84.42694 32.89595 -97.03720
2221 DFW IAH 134508 0.006013716 32.89595 -97.03720 29.98047 -95.33972
3568 LAS LAX 132333 0.007460065 36.08036 -115.15233 33.94254 -118.40807
2409 DTW ORD 130065 0.008881626 42.21206 -83.34884 41.97960 -87.90446
                                                                          26
27
28

More Related Content

What's hot (20)

Mosaic plot in R.
Mosaic plot in R.Mosaic plot in R.
Mosaic plot in R.
Dr. Volkan OBAN
 
ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions
Dr. Volkan OBAN
 
ggplot2 extensions-ggtree.
ggplot2 extensions-ggtree.ggplot2 extensions-ggtree.
ggplot2 extensions-ggtree.
Dr. Volkan OBAN
 
CLIM Undergraduate Workshop: Tutorial on R Software - Huang Huang, Oct 23, 2017
CLIM Undergraduate Workshop: Tutorial on R Software - Huang Huang, Oct 23, 2017CLIM Undergraduate Workshop: Tutorial on R Software - Huang Huang, Oct 23, 2017
CLIM Undergraduate Workshop: Tutorial on R Software - Huang Huang, Oct 23, 2017
The Statistical and Applied Mathematical Sciences Institute
 
C++ TUTORIAL 6
C++ TUTORIAL 6C++ TUTORIAL 6
C++ TUTORIAL 6
Farhan Ab Rahman
 
ECMAScript 6 major changes
ECMAScript 6 major changesECMAScript 6 major changes
ECMAScript 6 major changes
hayato
 
C++ TUTORIAL 7
C++ TUTORIAL 7C++ TUTORIAL 7
C++ TUTORIAL 7
Farhan Ab Rahman
 
Plot3D Package and Example in R.-Data visualizat,on
Plot3D Package and Example in R.-Data visualizat,onPlot3D Package and Example in R.-Data visualizat,on
Plot3D Package and Example in R.-Data visualizat,on
Dr. Volkan OBAN
 
Python hmm
Python hmmPython hmm
Python hmm
立民 林
 
Effector: we need to go deeper
Effector: we need to go deeperEffector: we need to go deeper
Effector: we need to go deeper
Victor Didenko
 
Om (Cont.)
Om (Cont.)Om (Cont.)
Om (Cont.)
Taku Fukushima
 
C++ TUTORIAL 10
C++ TUTORIAL 10C++ TUTORIAL 10
C++ TUTORIAL 10
Farhan Ab Rahman
 
Angular Refactoring in Real World
Angular Refactoring in Real WorldAngular Refactoring in Real World
Angular Refactoring in Real World
bitbank, Inc. Tokyo, Japan
 
Hacking the Internet of Things for Fun & Profit
Hacking the Internet of Things for Fun & ProfitHacking the Internet of Things for Fun & Profit
Hacking the Internet of Things for Fun & Profit
Ruben van Vreeland
 
C++ TUTORIAL 3
C++ TUTORIAL 3C++ TUTORIAL 3
C++ TUTORIAL 3
Farhan Ab Rahman
 
C++ TUTORIAL 9
C++ TUTORIAL 9C++ TUTORIAL 9
C++ TUTORIAL 9
Farhan Ab Rahman
 
Camping
CampingCamping
Camping
Gregor Schmidt
 
dplyr
dplyrdplyr
dplyr
Romain Francois
 
Metaprogramming
MetaprogrammingMetaprogramming
Metaprogramming
Dmitri Nesteruk
 
Data aggregation in R
Data aggregation in RData aggregation in R
Data aggregation in R
Andrija Djurovic
 

Similar to R meets Hadoop (20)

Refactoring to Macros with Clojure
Refactoring to Macros with ClojureRefactoring to Macros with Clojure
Refactoring to Macros with Clojure
Dmitry Buzdin
 
All I know about rsc.io/c2go
All I know about rsc.io/c2goAll I know about rsc.io/c2go
All I know about rsc.io/c2go
Moriyoshi Koizumi
 
R (Shiny Package) - Server Side Code for Decision Support System
R (Shiny Package) - Server Side Code for Decision Support SystemR (Shiny Package) - Server Side Code for Decision Support System
R (Shiny Package) - Server Side Code for Decision Support System
Maithreya Chakravarthula
 
Joclad 2010 d
Joclad 2010 dJoclad 2010 d
Joclad 2010 d
a1000caroliveira
 
BOXPLOT EXAMPLES in R And An Example for BEESWARM:
BOXPLOT EXAMPLES in R And  An Example for BEESWARM:BOXPLOT EXAMPLES in R And  An Example for BEESWARM:
BOXPLOT EXAMPLES in R And An Example for BEESWARM:
Dr. Volkan OBAN
 
Big Data Analytics with Scala at SCALA.IO 2013
Big Data Analytics with Scala at SCALA.IO 2013Big Data Analytics with Scala at SCALA.IO 2013
Big Data Analytics with Scala at SCALA.IO 2013
Samir Bessalah
 
dplyr use case
dplyr use casedplyr use case
dplyr use case
Romain Francois
 
Introduction to R
Introduction to RIntroduction to R
Introduction to R
Sander Kieft
 
Jan 2012 HUG: RHadoop
Jan 2012 HUG: RHadoopJan 2012 HUG: RHadoop
Jan 2012 HUG: RHadoop
Yahoo Developer Network
 
From Javascript To Haskell
From Javascript To HaskellFrom Javascript To Haskell
From Javascript To Haskell
ujihisa
 
Aaron Ellison Keynote: Reaching the 99%
Aaron Ellison Keynote: Reaching the 99%Aaron Ellison Keynote: Reaching the 99%
Aaron Ellison Keynote: Reaching the 99%
David LeBauer
 
Hadoop I/O Analysis
Hadoop I/O AnalysisHadoop I/O Analysis
Hadoop I/O Analysis
Richard McDougall
 
オープンデータを使ったモバイルアプリ開発(応用編)
オープンデータを使ったモバイルアプリ開発(応用編)オープンデータを使ったモバイルアプリ開発(応用編)
オープンデータを使ったモバイルアプリ開発(応用編)
Takayuki Goto
 
Implementing Software Machines in Go and C
Implementing Software Machines in Go and CImplementing Software Machines in Go and C
Implementing Software Machines in Go and C
Eleanor McHugh
 
ZeroMQ: Messaging Made Simple
ZeroMQ: Messaging Made SimpleZeroMQ: Messaging Made Simple
ZeroMQ: Messaging Made Simple
Ian Barber
 
Scilab presentation
Scilab presentation Scilab presentation
Scilab presentation
Nasir Ansari
 
R programming language
R programming languageR programming language
R programming language
Alberto Minetti
 
Артём Акуляков - F# for Data Analysis
Артём Акуляков - F# for Data AnalysisАртём Акуляков - F# for Data Analysis
Артём Акуляков - F# for Data Analysis
SpbDotNet Community
 
Using R for Building a Simple and Effective Dashboard
Using R for Building a Simple and Effective DashboardUsing R for Building a Simple and Effective Dashboard
Using R for Building a Simple and Effective Dashboard
Andrea Gigli
 
Clojure to Slang
Clojure to SlangClojure to Slang
Clojure to Slang
Magne Gåsland
 
Refactoring to Macros with Clojure
Refactoring to Macros with ClojureRefactoring to Macros with Clojure
Refactoring to Macros with Clojure
Dmitry Buzdin
 
All I know about rsc.io/c2go
All I know about rsc.io/c2goAll I know about rsc.io/c2go
All I know about rsc.io/c2go
Moriyoshi Koizumi
 
R (Shiny Package) - Server Side Code for Decision Support System
R (Shiny Package) - Server Side Code for Decision Support SystemR (Shiny Package) - Server Side Code for Decision Support System
R (Shiny Package) - Server Side Code for Decision Support System
Maithreya Chakravarthula
 
BOXPLOT EXAMPLES in R And An Example for BEESWARM:
BOXPLOT EXAMPLES in R And  An Example for BEESWARM:BOXPLOT EXAMPLES in R And  An Example for BEESWARM:
BOXPLOT EXAMPLES in R And An Example for BEESWARM:
Dr. Volkan OBAN
 
Big Data Analytics with Scala at SCALA.IO 2013
Big Data Analytics with Scala at SCALA.IO 2013Big Data Analytics with Scala at SCALA.IO 2013
Big Data Analytics with Scala at SCALA.IO 2013
Samir Bessalah
 
From Javascript To Haskell
From Javascript To HaskellFrom Javascript To Haskell
From Javascript To Haskell
ujihisa
 
Aaron Ellison Keynote: Reaching the 99%
Aaron Ellison Keynote: Reaching the 99%Aaron Ellison Keynote: Reaching the 99%
Aaron Ellison Keynote: Reaching the 99%
David LeBauer
 
オープンデータを使ったモバイルアプリ開発(応用編)
オープンデータを使ったモバイルアプリ開発(応用編)オープンデータを使ったモバイルアプリ開発(応用編)
オープンデータを使ったモバイルアプリ開発(応用編)
Takayuki Goto
 
Implementing Software Machines in Go and C
Implementing Software Machines in Go and CImplementing Software Machines in Go and C
Implementing Software Machines in Go and C
Eleanor McHugh
 
ZeroMQ: Messaging Made Simple
ZeroMQ: Messaging Made SimpleZeroMQ: Messaging Made Simple
ZeroMQ: Messaging Made Simple
Ian Barber
 
Scilab presentation
Scilab presentation Scilab presentation
Scilab presentation
Nasir Ansari
 
Артём Акуляков - F# for Data Analysis
Артём Акуляков - F# for Data AnalysisАртём Акуляков - F# for Data Analysis
Артём Акуляков - F# for Data Analysis
SpbDotNet Community
 
Using R for Building a Simple and Effective Dashboard
Using R for Building a Simple and Effective DashboardUsing R for Building a Simple and Effective Dashboard
Using R for Building a Simple and Effective Dashboard
Andrea Gigli
 

More from Hidekazu Tanaka (10)

ggplot2 に入門してみた
ggplot2 に入門してみたggplot2 に入門してみた
ggplot2 に入門してみた
Hidekazu Tanaka
 
データベースのお話
データベースのお話データベースのお話
データベースのお話
Hidekazu Tanaka
 
バギングで構築された各決定木
バギングで構築された各決定木バギングで構築された各決定木
バギングで構築された各決定木
Hidekazu Tanaka
 
アンサンブル学習
アンサンブル学習アンサンブル学習
アンサンブル学習
Hidekazu Tanaka
 
Rの紹介
Rの紹介Rの紹介
Rの紹介
Hidekazu Tanaka
 
Rで解く最適化問題 線型計画問題編
Rで解く最適化問題   線型計画問題編 Rで解く最適化問題   線型計画問題編
Rで解く最適化問題 線型計画問題編
Hidekazu Tanaka
 
RでMapreduce
RでMapreduceRでMapreduce
RでMapreduce
Hidekazu Tanaka
 
Rによるやさしい統計学 第16章 : 因子分析
Rによるやさしい統計学 第16章 : 因子分析Rによるやさしい統計学 第16章 : 因子分析
Rによるやさしい統計学 第16章 : 因子分析
Hidekazu Tanaka
 
ggplot2 に入門してみた
ggplot2 に入門してみたggplot2 に入門してみた
ggplot2 に入門してみた
Hidekazu Tanaka
 
データベースのお話
データベースのお話データベースのお話
データベースのお話
Hidekazu Tanaka
 
バギングで構築された各決定木
バギングで構築された各決定木バギングで構築された各決定木
バギングで構築された各決定木
Hidekazu Tanaka
 
アンサンブル学習
アンサンブル学習アンサンブル学習
アンサンブル学習
Hidekazu Tanaka
 
Rで解く最適化問題 線型計画問題編
Rで解く最適化問題   線型計画問題編 Rで解く最適化問題   線型計画問題編
Rで解く最適化問題 線型計画問題編
Hidekazu Tanaka
 
Rによるやさしい統計学 第16章 : 因子分析
Rによるやさしい統計学 第16章 : 因子分析Rによるやさしい統計学 第16章 : 因子分析
Rによるやさしい統計学 第16章 : 因子分析
Hidekazu Tanaka
 

Recently uploaded (20)

Procurement Insights Cost To Value Guide.pptx
Procurement Insights Cost To Value Guide.pptxProcurement Insights Cost To Value Guide.pptx
Procurement Insights Cost To Value Guide.pptx
Jon Hansen
 
Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...
Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...
Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...
Noah Loul
 
Mobile App Development Company in Saudi Arabia
Mobile App Development Company in Saudi ArabiaMobile App Development Company in Saudi Arabia
Mobile App Development Company in Saudi Arabia
Steve Jonas
 
Build Your Own Copilot & Agents For Devs
Build Your Own Copilot & Agents For DevsBuild Your Own Copilot & Agents For Devs
Build Your Own Copilot & Agents For Devs
Brian McKeiver
 
Technology Trends in 2025: AI and Big Data Analytics
Technology Trends in 2025: AI and Big Data AnalyticsTechnology Trends in 2025: AI and Big Data Analytics
Technology Trends in 2025: AI and Big Data Analytics
InData Labs
 
Special Meetup Edition - TDX Bengaluru Meetup #52.pptx
Special Meetup Edition - TDX Bengaluru Meetup #52.pptxSpecial Meetup Edition - TDX Bengaluru Meetup #52.pptx
Special Meetup Edition - TDX Bengaluru Meetup #52.pptx
shyamraj55
 
AI and Data Privacy in 2025: Global Trends
AI and Data Privacy in 2025: Global TrendsAI and Data Privacy in 2025: Global Trends
AI and Data Privacy in 2025: Global Trends
InData Labs
 
HCL Nomad Web – Best Practices and Managing Multiuser Environments
HCL Nomad Web – Best Practices and Managing Multiuser EnvironmentsHCL Nomad Web – Best Practices and Managing Multiuser Environments
HCL Nomad Web – Best Practices and Managing Multiuser Environments
panagenda
 
Quantum Computing Quick Research Guide by Arthur Morgan
Quantum Computing Quick Research Guide by Arthur MorganQuantum Computing Quick Research Guide by Arthur Morgan
Quantum Computing Quick Research Guide by Arthur Morgan
Arthur Morgan
 
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-UmgebungenHCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
panagenda
 
TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...
TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...
TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...
TrustArc
 
2025-05-Q4-2024-Investor-Presentation.pptx
2025-05-Q4-2024-Investor-Presentation.pptx2025-05-Q4-2024-Investor-Presentation.pptx
2025-05-Q4-2024-Investor-Presentation.pptx
Samuele Fogagnolo
 
Massive Power Outage Hits Spain, Portugal, and France: Causes, Impact, and On...
Massive Power Outage Hits Spain, Portugal, and France: Causes, Impact, and On...Massive Power Outage Hits Spain, Portugal, and France: Causes, Impact, and On...
Massive Power Outage Hits Spain, Portugal, and France: Causes, Impact, and On...
Aqusag Technologies
 
Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...
Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...
Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...
BookNet Canada
 
Big Data Analytics Quick Research Guide by Arthur Morgan
Big Data Analytics Quick Research Guide by Arthur MorganBig Data Analytics Quick Research Guide by Arthur Morgan
Big Data Analytics Quick Research Guide by Arthur Morgan
Arthur Morgan
 
Generative Artificial Intelligence (GenAI) in Business
Generative Artificial Intelligence (GenAI) in BusinessGenerative Artificial Intelligence (GenAI) in Business
Generative Artificial Intelligence (GenAI) in Business
Dr. Tathagat Varma
 
Increasing Retail Store Efficiency How can Planograms Save Time and Money.pptx
Increasing Retail Store Efficiency How can Planograms Save Time and Money.pptxIncreasing Retail Store Efficiency How can Planograms Save Time and Money.pptx
Increasing Retail Store Efficiency How can Planograms Save Time and Money.pptx
Anoop Ashok
 
How analogue intelligence complements AI
How analogue intelligence complements AIHow analogue intelligence complements AI
How analogue intelligence complements AI
Paul Rowe
 
Manifest Pre-Seed Update | A Humanoid OEM Deeptech In France
Manifest Pre-Seed Update | A Humanoid OEM Deeptech In FranceManifest Pre-Seed Update | A Humanoid OEM Deeptech In France
Manifest Pre-Seed Update | A Humanoid OEM Deeptech In France
chb3
 
Cyber Awareness overview for 2025 month of security
Cyber Awareness overview for 2025 month of securityCyber Awareness overview for 2025 month of security
Cyber Awareness overview for 2025 month of security
riccardosl1
 
Procurement Insights Cost To Value Guide.pptx
Procurement Insights Cost To Value Guide.pptxProcurement Insights Cost To Value Guide.pptx
Procurement Insights Cost To Value Guide.pptx
Jon Hansen
 
Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...
Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...
Noah Loul Shares 5 Steps to Implement AI Agents for Maximum Business Efficien...
Noah Loul
 
Mobile App Development Company in Saudi Arabia
Mobile App Development Company in Saudi ArabiaMobile App Development Company in Saudi Arabia
Mobile App Development Company in Saudi Arabia
Steve Jonas
 
Build Your Own Copilot & Agents For Devs
Build Your Own Copilot & Agents For DevsBuild Your Own Copilot & Agents For Devs
Build Your Own Copilot & Agents For Devs
Brian McKeiver
 
Technology Trends in 2025: AI and Big Data Analytics
Technology Trends in 2025: AI and Big Data AnalyticsTechnology Trends in 2025: AI and Big Data Analytics
Technology Trends in 2025: AI and Big Data Analytics
InData Labs
 
Special Meetup Edition - TDX Bengaluru Meetup #52.pptx
Special Meetup Edition - TDX Bengaluru Meetup #52.pptxSpecial Meetup Edition - TDX Bengaluru Meetup #52.pptx
Special Meetup Edition - TDX Bengaluru Meetup #52.pptx
shyamraj55
 
AI and Data Privacy in 2025: Global Trends
AI and Data Privacy in 2025: Global TrendsAI and Data Privacy in 2025: Global Trends
AI and Data Privacy in 2025: Global Trends
InData Labs
 
HCL Nomad Web – Best Practices and Managing Multiuser Environments
HCL Nomad Web – Best Practices and Managing Multiuser EnvironmentsHCL Nomad Web – Best Practices and Managing Multiuser Environments
HCL Nomad Web – Best Practices and Managing Multiuser Environments
panagenda
 
Quantum Computing Quick Research Guide by Arthur Morgan
Quantum Computing Quick Research Guide by Arthur MorganQuantum Computing Quick Research Guide by Arthur Morgan
Quantum Computing Quick Research Guide by Arthur Morgan
Arthur Morgan
 
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-UmgebungenHCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
panagenda
 
TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...
TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...
TrustArc Webinar: Consumer Expectations vs Corporate Realities on Data Broker...
TrustArc
 
2025-05-Q4-2024-Investor-Presentation.pptx
2025-05-Q4-2024-Investor-Presentation.pptx2025-05-Q4-2024-Investor-Presentation.pptx
2025-05-Q4-2024-Investor-Presentation.pptx
Samuele Fogagnolo
 
Massive Power Outage Hits Spain, Portugal, and France: Causes, Impact, and On...
Massive Power Outage Hits Spain, Portugal, and France: Causes, Impact, and On...Massive Power Outage Hits Spain, Portugal, and France: Causes, Impact, and On...
Massive Power Outage Hits Spain, Portugal, and France: Causes, Impact, and On...
Aqusag Technologies
 
Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...
Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...
Transcript: #StandardsGoals for 2025: Standards & certification roundup - Tec...
BookNet Canada
 
Big Data Analytics Quick Research Guide by Arthur Morgan
Big Data Analytics Quick Research Guide by Arthur MorganBig Data Analytics Quick Research Guide by Arthur Morgan
Big Data Analytics Quick Research Guide by Arthur Morgan
Arthur Morgan
 
Generative Artificial Intelligence (GenAI) in Business
Generative Artificial Intelligence (GenAI) in BusinessGenerative Artificial Intelligence (GenAI) in Business
Generative Artificial Intelligence (GenAI) in Business
Dr. Tathagat Varma
 
Increasing Retail Store Efficiency How can Planograms Save Time and Money.pptx
Increasing Retail Store Efficiency How can Planograms Save Time and Money.pptxIncreasing Retail Store Efficiency How can Planograms Save Time and Money.pptx
Increasing Retail Store Efficiency How can Planograms Save Time and Money.pptx
Anoop Ashok
 
How analogue intelligence complements AI
How analogue intelligence complements AIHow analogue intelligence complements AI
How analogue intelligence complements AI
Paul Rowe
 
Manifest Pre-Seed Update | A Humanoid OEM Deeptech In France
Manifest Pre-Seed Update | A Humanoid OEM Deeptech In FranceManifest Pre-Seed Update | A Humanoid OEM Deeptech In France
Manifest Pre-Seed Update | A Humanoid OEM Deeptech In France
chb3
 
Cyber Awareness overview for 2025 month of security
Cyber Awareness overview for 2025 month of securityCyber Awareness overview for 2025 month of security
Cyber Awareness overview for 2025 month of security
riccardosl1
 

R meets Hadoop

  • 6. 6
  • 7. 7
  • 8. 8
  • 9. Blocks (Input Data) Parallel Partition Node 1 Node 2 Node 3 (MAP) Network Transfer Parallel Recombine Node 1 Node 2 Node 3 (REDUCE) Output Data 9
  • 10. 10
  • 11. 11
  • 12. 12
  • 13. 13
  • 14. 14
  • 15. • R CMD INSTALL Rhipe_version.tar.gz 15
  • 16. map <- expression({ # }) reduce <- expression( pre = {}, reduce = {}, post = {} ) z <- rhmr(map=map, reduce=reduce, inout=c("text","sequence") ,ifolder=”/tmp/input”, ofolder=”/tmp/output”) rhex(z) results <- rhread(“/tmp/output”) 16
  • 17. map <- expression({ library(openNLP) f <- table(tokenize(unlist(map.values), language = "en")) n <- names(f) p <- as.numeric(f) sapply(seq_along(n),function(r) rhcollect(n[r],p[r])) }) reduce <- expression( pre = { total <- 0}, reduce = { total <- total+sum(unlist(reduce.values)) }, post = { rhcollect(reduce.key,total) } ) z <- rhmr(map=map, reduce=reduce, inout=c("text","sequence") ,ifolder=”/tmp/input”, ofolder=”/tmp/output”) rhex(z) 17
  • 18. > results <- rhread("/tmp/output") > results <- data.frame(word=unlist(lapply(results,"[[",1))’ + ,count =unlist (lapply(results,"[[",2))) > results <- (results[order(results$count, decreasing=TRUE), ]) > head(results) word count 13 . 2080 439 the 1101 11 , 760 32 a 701 153 to 658 28 I 651 > results[results["word"] == "FACEBOOK", ] word count 3221 FACEBOOK 6 > results[results["word"] == "Facebook", ] word count 3223 Facebook 39 > results[results["word"] == "facebook", ] word count 3389 facebook 6 18
  • 19. 19
  • 20. map <- expression({ msys <- function(on){ system(sprintf("wget %s --directory-prefix ./tmp 2> ./errors",on)) if(length(grep("(failed)|(unable)",readLines("./errors")))>0){ stop(paste(readLines("./errors"),collapse="n")) }} lapply(map.values,function(x){ x=1986+x on <- sprintf("http://stat-computing.org/dataexpo/2009/%s.csv.bz2",x) fn <- sprintf("./tmp/%s.csv.bz2",x) rhstatus(sprintf("Downloading %s", on)) msys(on) rhstatus(sprintf("Downloaded %s", on)) system(sprintf('bunzip2 %s',fn)) rhstatus(sprintf("Unzipped %s", on)) rhcounter("FILES",x,1) rhcounter("FILES","_ALL_",1) }) }) z <- rhmr(map=map,ofolder="/airline/data",inout=c("lapply"), N=length(1987:2008), mapred=list(mapred.reduce.tasks=0,mapred.task.timeout=0),copyFiles=TRUE) j <- rhex(z,async=TRUE) 20
  • 21. setup <- expression({ convertHHMM <- function(s){ t(sapply(s,function(r){ l=nchar(r) if(l==4) c(substr(r,1,2),substr(r,3,4)) else if(l==3) c(substr(r,1,1),substr(r,2,3)) else c('0','0') }) )} }) map <- expression({ y <- do.call("rbind",lapply(map.values,function(r){ if(substr(r,1,4)!='Year') strsplit(r,",")[[1]] })) mu <- rep(1,nrow(y));yr <- y[,1]; mn=y[,2];dy=y[,3] hr <- convertHHMM(y[,5]) depart <- ISOdatetime(year=yr,month=mn,day=dy,hour=hr[,1],min=hr[,2],sec=mu) hr <- convertHHMM(y[,6]) sdepart <- ISOdatetime(year=yr,month=mn,day=dy,hour=hr[,1],min=hr[,2],sec=mu) hr <- convertHHMM(y[,7]) arrive <- ISOdatetime(year=yr,month=mn,day=dy,hour=hr[,1],min=hr[,2],sec=mu) hr <- convertHHMM(y[,8]) sarrive <- ISOdatetime(year=yr,month=mn,day=dy,hour=hr[,1],min=hr[,2],sec=mu) d <- data.frame(depart= depart,sdepart = sdepart, arrive = arrive,sarrive =sarrive ,carrier = y[,9],origin = y[,17], dest=y[,18],dist = y[,19], year=yr, month=mn, day=dy ,cancelled=y[,22], stringsAsFactors=FALSE) d <- d[order(d$sdepart),] rhcollect(d[c(1,nrow(d)),"sdepart"],d) }) reduce <- expression( reduce = { lapply(reduce.values, function(i) rhcollect(reduce.key,i))} ) z <- rhmr(map=map,reduce=reduce,setup=setup,inout=c("text","sequence") ,ifolder="/airline/data/",ofolder="/airline/blocks",mapred=mapred,orderby="numeric") 21 rhex(z)
  • 22. map <- expression({ a <- do.call("rbind",map.values) inbound <- table(a[,'origin']) outbound <- table(a[,'dest']) total <- table(unlist(c(a[,'origin'],a['dest']))) for (n in names(total)) { inb <- if(is.na(inbound[n])) 0 else inbound[n] ob <- if(is.na(outbound[n])) 0 else outbound[n] rhcollect(n, c(inb,ob, total[n])) } }) reduce <- expression( pre = { sums <- c(0,0,0) }, reduce = { sums <- sums+apply(do.call("rbind",reduce.values),2,sum) }, post = { rhcollect(reduce.key, sums) } ) z <- rhmr(map=map,reduce=reduce,combiner=TRUE,inout=c("sequence","sequence") ,ifolder="/airline/blocks/",ofolder="/airline/volume") rhex(z,async=TRUE) 22
  • 23. > counts <- rhread("/airline/volume") > aircode <- unlist(lapply(counts, "[[",1)) > count <- do.call("rbind",lapply(counts,"[[",2)) > results <- data.frame(aircode=aircode, + inb=count[,1],oub=count[,2],all=count[,3] + ,stringsAsFactors=FALSE) > results <- results[order(results$all,decreasing=TRUE),] > ap <- read.table("~/tmp/airports.csv",sep=",",header=TRUE, + stringsAsFactors=FALSE,na.strings="XYZB") > results$airport <- sapply(results$aircode,function(r){ + nam <- ap[ap$iata==r,'airport'] + if(length(nam)==0) r else nam + }) > results[1:10,] aircode inb oub all airport 243 ORD 6597442 6638035 13235477 Chicago O'Hare International 21 ATL 6100953 6094186 12195139 William B Hartsfield-Atlanta Intl 91 DFW 5710980 5745593 11456573 Dallas-Fort Worth International 182 LAX 4089012 4086930 8175942 Los Angeles International 254 PHX 3491077 3497764 6988841 Phoenix Sky Harbor International 89 DEN 3319905 3335222 6655127 Denver Intl 97 DTW 2979158 2997138 5976296 Detroit Metropolitan-Wayne County 156 IAH 2884518 2889971 5774489 George Bush Intercontinental 230 MSP 2754997 2765191 5520188 Minneapolis-St Paul Intl 300 SFO 2733910 2725676 5459586 San Francisco International 23
  • 25. map <- expression({ a <- do.call("rbind",map.values) y <- table(apply(a[,c("origin","dest")],1,function(r){ paste(sort(r),collapse=",") })) for(i in 1:length(y)){ p <- strsplit(names(y)[[i]],",")[[1]] rhcollect(p,y[[1]]) } }) reduce <- expression( pre = {sums <- 0}, reduce = {sums <- sums+sum(unlist(reduce.values))}, post = { rhcollect(reduce.key, sums) } ) z <- rhmr(map=map,reduce=reduce,combiner=TRUE,inout=c("sequence","sequence") ,ifolder="/airline/blocks/",ofolder="/airline/ijjoin") z=rhex(z) 25
  • 26. > b=rhread("/airline/ijjoin") > y <- do.call("rbind",lapply(b,"[[",1)) > results <- data.frame(a=y[,1],b=y[,2],count= + do.call("rbind",lapply(b,"[[",2)),stringsAsFactors=FALSE) > results <- results[order(results$count,decreasing=TRUE),] > results$cumprop <- cumsum(results$count)/sum(results$count) > a.lat <- t(sapply(results$a,function(r){ + ap[ap$iata==r,c('lat','long')] + })) > results$a.lat <- unlist(a.lat[,'lat']) > results$a.long <- unlist(a.lat[,'long']) > b.lat <- t(sapply(results$b,function(r){ + ap[ap$iata==r,c('lat','long')] + })) > b.lat["CBM",] <- c(0,0) > results$b.lat <- unlist(b.lat[,'lat']) > results$b.long <- unlist(b.lat[,'long']) > head(results) a b count cumprop a.lat a.long b.lat b.long 418 ATL ORD 141465 0.001546158 33.64044 -84.42694 41.97960 -87.90446 2079 DEN DFW 138892 0.003064195 39.85841 -104.66700 32.89595 -97.03720 331 ATL DFW 135357 0.004543595 33.64044 -84.42694 32.89595 -97.03720 2221 DFW IAH 134508 0.006013716 32.89595 -97.03720 29.98047 -95.33972 3568 LAS LAX 132333 0.007460065 36.08036 -115.15233 33.94254 -118.40807 2409 DTW ORD 130065 0.008881626 42.21206 -83.34884 41.97960 -87.90446 26
  • 27. 27
  • 28. 28

Editor's Notes