0% found this document useful (0 votes)
8 views

Pig

Uploaded by

sumeetmkhetan
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
8 views

Pig

Uploaded by

sumeetmkhetan
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 6

=================================================================

PIG Architecture
=================================================================
Two modes :

1) Local - used for files on the local system; processing is done on the local machine.
2) MapReduce - used for HDFS, i.e. a shared file system; processing is done in a
distributed manner, i.e. on multiple machines/nodes.
3) HCatalog - start with: pig -useHCatalog

=================================================================
DATA INGESTION (SCHEMALESS LOADING)
=================================================================

To go to pig :

pig -x local

-- Schemaless load: no AS clause, so every field defaults to type bytearray
-- and columns are addressed positionally ($0, $1, ...).
EMP = LOAD '/home/itelligence/Dataset/CSV/EMP.csv' USING PigStorage(',');

DUMP EMP;   -- every Pig statement needs a terminating semicolon (missing in original)

-- SQL-to-Pig mapping: SELECT -> GENERATE, FROM -> FOREACH
-- (these were bare notes in the original; as comments they no longer break parsing)

A = FOREACH EMP GENERATE $1,$2;     -- columns 1 and 2 (zero-based positions)

A = FOREACH EMP GENERATE $2..$5;    -- range projection: columns 2 through 5
A = FOREACH EMP GENERATE *;         -- all columns

=================================================================
DATA INGESTION (SCHEMA BASED LOADING)
=================================================================

DEFAULT DATATYPE IN PIG IS 'BYTEARRAY'


i.e if you describe any variable it will show its datatype as 'BYTEARRAY'

-- Schema-based load: the AS clause assigns names and types.
-- (The default type without AS would be bytearray.)
EMP = LOAD '/home/itelligence/Dataset/csv/EMP.csv' USING PigStorage(',') AS
    (EMPLOYEE_ID:INT, FIRST_NAME:CHARARRAY, LAST_NAME:CHARARRAY, EMAIL:CHARARRAY,
     PHONE_NUMBER:CHARARRAY, HIRE_DATE:CHARARRAY, JOB_ID:CHARARRAY,
     SALARY:INT, MANAGER_ID:INT, DEPARTMENT_ID:INT);

DESCRIBE EMP;   -- to see the schema; original said DESCRIBE B, but the loaded alias is EMP

=================================================================
FILTERING
=================================================================

-- Employees earning more than 5000.
B = FILTER EMP BY SALARY > 5000;

-- First names starting with 'A' in either case.
-- UPPER() avoids the duplicated ('A' OR 'a') comparison of the original.
B = FILTER EMP BY UPPER(SUBSTRING(FIRST_NAME,0,1)) == 'A';

-- Alternative: emit the boolean itself instead of filtering on it.
B = FOREACH EMP GENERATE FIRST_NAME, (SUBSTRING(FIRST_NAME,0,1) == 'A');

PIG FUNCTION LIBRARY :


/home/itelligence/pig-0.11.1/docs/func.html

Task: find rows whose ORDER AMOUNT is between 200 and 400 in file ORDER.CSV.
The ORDER AMOUNT value may appear in column $17 (or $18/$19, see below).

-- The amount column shifts position between rows ($17, $18 or $19) and is
-- prefixed with a literal '$'.  Strategy: project each candidate column, keep
-- only values starting with '$', union the three, strip the prefix, cast and
-- filter by range.
A = Load '/home/itelligence/orders.csv' using PigStorage(',');

B = FOREACH A generate $0,$17;


B1= FILTER B BY SUBSTRING($1,0,1) == '$';

C= FOREACH A generate $0,$18;


C1= FILTER C BY SUBSTRING($1,0,1) == '$';

D = FOREACH A generate $0,$19;


D1= FILTER D BY SUBSTRING($1,0,1) == '$';

-- UNION keeps duplicates; that is fine here since each row matched exactly one branch.
E = UNION B1,C1,D1;

-- Drop the leading '$' (substring from index 1) and cast to INT for the range test.
F = FOREACH E GENERATE $0, (INT)SUBSTRING(TRIM($1),1,20);

-- NOTE(review): the task statement says between 200 and 400, but this filter
-- uses 4000 — confirm which bound is intended.
G = FILTER F BY $1 >200 AND $1<4000;

Dump G ;
=================================================================
LOAD DATA WITH MULTIPLE DELIMITER
=================================================================

When there are multiple delimiters present in a file (e.g. a mix of spaces, commas and
colons), we first load the file into Pig without PigStorage, and then use the
REGEX_EXTRACT_ALL function to split the line into comma-separated fields.

-- Load each line as a single chararray field (no PigStorage delimiter).
A = LOAD '/home/itelligence/Untitled Document 1' ;

-- REGEX_EXTRACT_ALL returns one tuple of the four captured groups;
-- FLATTEN unnests that tuple into four separate fields.
B = FOREACH A GENERATE FLATTEN(REGEX_EXTRACT_ALL($0,'(.*) (.*),(.*):(.*)'));

-- Write the result back out comma-delimited.
STORE B INTO '/home/itelligence/sumeet' USING PigStorage(',');


=================================================================
Group by
=================================================================

EMP = LOAD '/home/itelligence/Dataset/CSV/EMP.csv' USING PigStorage(',');

-- Group by department id ($9): each group carries the key plus a bag of EMP tuples.
A = GROUP EMP BY $9;

-- Number of employees per department: $0 is the group key, $1 is the bag.
B = FOREACH A GENERATE $0, COUNT($1);

DUMP B;

-- Average salary ($7 inside the bag) per department.
-- The original had the typo FROEACH, which Pig rejects at parse time.
C = FOREACH A GENERATE $0, AVG($1.$7);

DUMP C;

==============================================================
SCRIPT TO FILTER ONLY ERROR CODE AND ERROR MESSAGE FROM THE LOG FILE :
-- Load the raw log, keep only lines beginning with 'ERROR',
-- then split each into (error code, error message) via regex capture groups.
LOG = LOAD '/home/itelligence/pig_1491722865605.log';

A = FILTER LOG BY SUBSTRING($0,0,5) == 'ERROR';

B = FOREACH A GENERATE FLATTEN(REGEX_EXTRACT_ALL($0,'ERROR (.*):(.*)'));

DUMP B;
=================================================================
JOINS
=================================================================

FOR LEFT OUTER AND RIGHT OUTER JOINS YOU NEED TO SPECIFY THE SCHEMA OF THE FILES
WHILE LOADING THEM.

FOR A LEFT OUTER JOIN, SPECIFY THE SCHEMA OF THE LEFT FILE, AND VICE VERSA.

-- NOTE(review): AS () is a placeholder — an empty schema is not valid Pig.
-- Fill in the real column list before running (EMP's schema appears earlier
-- in these notes); outer joins require a schema on the outer side.
EMP = LOAD '/home/itelligence/Dataset/CSV/EMP.csv' USING PigStorage (',') AS ();


DEP = LOAD '/home/itelligence/DEPT' USING PigStorage (',') AS ();
-- Join key: EMP.$9 (DEPARTMENT_ID) against DEP.$0 (department key).
JOIN_INNER = JOIN EMP BY $9,DEP BY $0;
JOIN_LO = JOIN EMP BY $9 LEFT OUTER,DEP BY $0;
JOIN_RO = JOIN EMP BY $9 RIGHT OUTER,DEP BY $0;
JOIN_FO = JOIN EMP BY $9 FULL OUTER,DEP BY $0;

REPLICATED JOIN :

IN THIS CASE, WHEN WE JOIN THE TWO TABLES, THE SMALLER TABLE IS COPIED TO EVERY NODE
WHERE THE BLOCKS OF THE LARGER TABLE ARE PLACED, SO THAT THE LOOK-UP CAN BE
PERFORMED LOCALLY. USED FOR PERFORMANCE OPTIMIZATION.

SKEWED JOIN :

MERGE JOIN :

============================SCHEMA ON READ=======================

=================================================================
COGROUP
=================================================================

-- COGROUP groups both relations by their respective keys side by side.
-- The original referenced alias DEPT, but the relation was loaded as DEP.
CG = COGROUP EMP BY $9, DEP BY $0;

-- Per key: the group value, the count of matching EMP tuples, and of DEP tuples.
A = FOREACH CG GENERATE $0, COUNT($1), COUNT($2);


=================================================================
JSONLOADER
=================================================================

IN ORDER TO LOAD THE JSON FILE WE USE JSONLOADER INSTEAD OF PigStorage


WHILE USING JSONLOADER SCHEMA IS SPECIFIED IN SINGLE QUOTES ('')

-- JsonLoader replaces PigStorage for JSON input.  An optional schema goes in
-- single quotes, e.g. USING JsonLoader('name:chararray, age:int').
A = LOAD '<JSONFILE PATH>' USING JsonLoader;


DUMP A;
=================================================================
UDF
=================================================================

CREATE PROJECT ==> ADD LIBRARIES(HADOOP LIBRARIES,PIG LIBRARIES) ==> CREATE PACKAGE
==> CREATE CLASS ==> PASTE PROGRAM ==> EXPORT TO JAR

now to tell pig about udf location :


REGISTER <LOCATION OF JAR FILE>

package myudfs;

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

/**
 * Pig EvalFunc UDF that upper-cases its first argument.
 *
 * Usage after building the jar:
 *   REGISTER /path/to/myudfs.jar;
 *   B = FOREACH A GENERATE myudfs.ToUpper($0);
 */
public class ToUpper extends EvalFunc<String> {

    /**
     * @param input tuple whose first field is the value to convert
     * @return the upper-cased string, or null for a null/empty input tuple
     * @throws IOException per the EvalFunc contract (not thrown directly here)
     */
    public String exec(Tuple input) throws IOException {
        if (input == null || input.size() == 0) {
            return null;
        }
        String str = input.get(0).toString();
        return str.toUpperCase();
    }
} // the original was missing this closing brace for the class
=================================================================
PIGGYBANK
=================================================================

contains predefined UDFs. First register the piggybank jar in Pig, and then we can
use any UDF from piggybank.

=================================================================
XML FILE LOADING TO PIG
=================================================================

-- XMLLoader (from piggybank; register it first) emits one record per
-- <property>...</property> element of the XML file.
A = LOAD '/home/itelligence/hadoop-1.2.0/conf/hdfs-site.xml' USING


org.apache.pig.piggybank.storage.XMLLoader('property');

-- Extract the name/value pair; \\s* tolerates arbitrary whitespace between tags.
B = FOREACH A GENERATE
FLATTEN(REGEX_EXTRACT_ALL($0,'<property>\\s*<name>(.*)</name>\\s*<value>(.*)</
value>\\s*</property>'));

here \\s* matches any run of whitespace (zero or more whitespace characters)


use \\s to match exactly one whitespace character
(the backslash must be doubled because the pattern is written inside a Pig string)
=================================================================
DYNAMIC PATH
=================================================================

(writing Pig statements in a script file and then executing the file directly, either
from the hadoop ($) prompt or the grunt prompt)

from the $ prompt (note: the parameter flag is -param, with a single dash):

pig -x local -param PATH=/home/itelligence/Desktop/yatra.txt


/home/itelligence/Desktop/script_ns.pig

FROM THE GRUNT SHELL:

exec /home/itelligence/Desktop/script_ns.pig
=================================================================
WORD COUNT IN FLAT FILE
=================================================================

A = LOAD 'FILE PATH';

-- One word per output tuple.
B = FOREACH A GENERATE FLATTEN(TOKENIZE($0));

-- Generalised expression to strip special characters: replace every
-- non-alphanumeric character with the empty string.
B1 = FOREACH B GENERATE REPLACE($0,'[^a-zA-Z0-9]','');

-- The original grouped B, silently discarding the cleaned relation B1.
C = GROUP B1 BY $0;

D = FOREACH C GENERATE $0, COUNT($1);

=======================================================
START-UP

employees = LOAD '/home/itelligence/Dataset/CSV/EMP.csv' USING PigStorage(',') AS
    (EMPLOYEE_ID:int, FIRST_NAME:chararray, LAST_NAME:chararray,
     EMAIL:chararray, PHONE_NUMBER:chararray, HIRE_DATE:chararray,
     JOB_ID:chararray, SALARY:int, MANAGER_ID:int, DEPARTMENT_ID:int);

DESCRIBE employees;

EMP_FILTERED = FILTER employees BY SALARY > 5000;

-- Keep SALARY in the projection: the aggregate below must reference the bag
-- produced by GROUP NESTED (the original read EMP_FILTERED there, which is
-- not in scope after the GROUP, and NESTED had no SALARY field).
NESTED = FOREACH EMP_FILTERED GENERATE SALARY, HIRE_DATE,
    SUBSTRING(HIRE_DATE,0,2) AS DAY,
    SUBSTRING(HIRE_DATE,3,6) AS MONTH;

GRP = GROUP NESTED BY (MONTH, DAY);

-- Per (MONTH, DAY): average salary and number of hires.
M = FOREACH GRP GENERATE group, AVG(NESTED.SALARY), COUNT(NESTED);

-- '-schema' makes PigStorage also write a .pig_schema file next to the data.
-- (This statement was split across lines by the document extraction.)
STORE employees INTO '/home/itelligence/Dataset/PigEmitted' USING PigStorage('~','-schema');

==============================================================
JOIN

TRANSACTIONS = load 'data/transactions' using PigStorage('\t') as (id:int,
    product:int, user:int, purchase_amount:double, description:chararray);

-- The original never loaded USERS, which the join below requires.
-- Path/schema assumed symmetric with TRANSACTIONS — confirm against the dataset.
USERS = load 'data/users' using PigStorage('\t') as (id:int, location:chararray);

-- LEFT OUTER keeps every transaction even when the user record is missing.
A = JOIN TRANSACTIONS by user LEFT OUTER, USERS by id;

B = GROUP A by product;

-- Per product: the number of distinct user locations that bought it.
C = FOREACH B {
    LOCS = DISTINCT A.location;
    GENERATE group, COUNT(LOCS) as location_count;
};

DUMP C;
==============================================================
WORDCOUNT

-- Classic word count: tokenize each line, group by word, count per group.
lines = load '------/input.txt';
words = foreach lines generate flatten(TOKENIZE((chararray)$0)) as word;
by_word = group words by word;
counts = foreach by_word generate COUNT(words), group;
store counts into '------/wordcount';

==============================================================
-- Problem statement: for each customer, find the total number of items bought
-- and which item he/she bought the highest number of times.
-- Load the input data :: schema ( customerId , itemId , orderDate , deliveryDate ).
-- (Several comment continuation lines below had lost their '--' markers, which
-- would be parse errors; restored as comments, code unchanged.)

orders = load '/testData100k' using PigStorage(',') as (cstrId:int, itmId:int,


orderDate: long, deliveryDate: long );

-- group by (customer-id, item-id) and count orders per pair

grpd_cstr_itm = group orders by (cstrId,itmId);


grpd_cstr_itm_cnt = foreach grpd_cstr_itm generate group.cstrId as cstrId,
group.itmId as itmId, COUNT(orders) as itmCnt;

-- regroup by cstrId alone

grpd_cstr = group grpd_cstr_itm_cnt by cstrId ;


describe grpd_cstr;

-- grpd_cstr: {group: int,grpd_cstr_itm_cnt: {cstrId: int,itmId: int,itmCnt: long}}


-- Iterate over each customer's group: sum the per-item counts for the total,
-- and take the top row by count for the item bought the highest number of times.
result = foreach grpd_cstr{
total_orders = SUM(grpd_cstr_itm_cnt.itmCnt);
srtd_orders = order grpd_cstr_itm_cnt by itmCnt desc;
higest_bought = limit srtd_orders 1;
generate FLATTEN(higest_bought),total_orders as totalCnt;
};
-- result contains ( customer_id , item_id_bought_highest_times,
-- number_of_times_it_was_bought, total_items );
describe result;
-- result: {higest_bought::cstrId: int,higest_bought::itmId:
-- int,higest_bought::itmCnt: long,totalCnt: long}

==============================================================

You might also like