Extract Google Trends Data with Python

Anyone who has worked regularly with Google Trends data has dealt with the slightly tedious task of grabbing keyword-level data and reformatting the spreadsheet that Google provides. After looking for a seamless way to pull the data, I came upon the PyTrends library on GitHub and put together some quick user-defined functions to manage the task of pulling daily and weekly trends data.


# set working directory (location where code is)
import os
os.chdir("path")

import re
import csv
import time
import pandas as pd
from random import randint
from functools import reduce  # built in on Python 2; the explicit import keeps Python 3 happy
from GT_Automation_Code import pyGTrends

# set Gmail credentials used by pyGTrends to log in
google_username = "*****@gmail.com"
google_password = "*****"

Daily_Data = [ ]
        
# define daily pull code
def GT_Daily_Run(keys):
    
    path = 'path'

    # connect to Google
    connector = pyGTrends(google_username, google_password)
    # make request
    connector.request_report(keys, date="today 90-d", geo="US")
    # wait a random amount of time between requests to avoid bot detection
    time.sleep(randint(5, 10))
    # download file
    connector.save_csv(path, '_' + "GT_Daily" + '_' + keys.replace(' ', '_'))

    name = path + '_' + "GT_Daily" + '_' + keys.replace(' ', '_')
      
    with open(name + '.csv', 'rt') as csvfile:    
        csvReader = csv.reader(csvfile)
        data = []

        # Google's CSV mixes several report sections; keep only the data rows,
        # identified here by the year in the date column
        for row in csvReader:
            if any('2015' in s for s in row):
                data.append(row)

        day_df = pd.DataFrame(data)
        day_df.columns = ["Day", keys]  # assign the list directly; wrapping it in another list creates a MultiIndex
        Daily_Data.append(day_df)

keywords = ['soccer', 'football', 'baseball']    

for kw in keywords:
    GT_Daily_Run(kw)

# merge the per-keyword frames on the date column
df_final_daily = reduce(lambda left, right: pd.merge(left, right, on='Day'), Daily_Data)
df_final_daily = df_final_daily.loc[:, (df_final_daily != "0").any(axis=0)]
df_final_daily.to_csv("Daily_Trends_Data.csv", index=False)

Weekly_Data = [ ]     
        
# define weekly pull code
def GT_Weekly_Run(keys):

    path = 'path'
    
    # connect to Google
    connector = pyGTrends(google_username, google_password)
    # make request
    connector.request_report(keys, geo="US")
    # wait a random amount of time between requests to avoid bot detection
    time.sleep(randint(5, 10))
    # download file
    connector.save_csv(path, '_' + "GT_Weekly" + '_' + keys.replace(' ', '_'))

    name = path + '_' + "GT_Weekly" + '_' + keys.replace(' ', '_') 

    with open(name + '.csv', 'rt') as csvfile:    
        csvReader = csv.reader(csvfile)
        data = []
        # match ISO dates (YYYY-MM-DD) to pick out the data rows
        datex = re.compile(r'(19|20)\d\d-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])')

        for row in csvReader:
            if datex.search(str(row)):
                data.append(row)
       
        week_df = pd.DataFrame(data)
        week_df.columns = ["Week", keys]
        Weekly_Data.append(week_df)
             
for kw in keywords:
    GT_Weekly_Run(kw)

# merge the per-keyword frames on the week column
df_final_weekly = reduce(lambda left, right: pd.merge(left, right, on='Week'), Weekly_Data)
df_final_weekly = df_final_weekly.loc[:, (df_final_weekly != "0").any(axis=0)] 
df_final_weekly.to_csv("Weekly_Trends_Data.csv", index=False)
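
As an aside, the credential-based pyGTrends module used above is only one way to get at this data. If you have the newer pytrends package installed instead, a minimal sketch of a comparable daily pull (assuming its TrendReq interface, which needs no Google login) would look like this:

from pytrends.request import TrendReq

# sketch assuming the newer pytrends interface
pytrends = TrendReq(hl='en-US', tz=360)
pytrends.build_payload(kw_list=['soccer'], timeframe='today 3-m', geo='US')
daily_df = pytrends.interest_over_time()  # DataFrame indexed by date
print(daily_df.head())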



A Few Days of Python: Automating Tasks Involving Excel Files

There are plenty of instances where analysts are regularly forwarded xls spreadsheets and tasked with summarizing the data. In many cases, these scenarios can be automated through fairly simple Python scripts. In the following code, I take an Excel spreadsheet with two sheets, summarize each sheet using a pivot table, and add those results to sheets in a new spreadsheet.


# IMPORT MODULES
import os 
import sys
import pandas as pd
import numpy as np
import xlsxwriter


def automate(file1, file2):

    # bail out early if the input file is empty
    if os.stat(file1).st_size == 0:
        print("File Status: Empty")
        sys.exit("Error")
    else:
        xl = pd.ExcelFile(file1)
        writer = pd.ExcelWriter(file2, engine='xlsxwriter')

    # Sheet1 pivot: average lead_payout per buyer_aid
    dat1 = xl.parse("Sheet1")
    dat1_result = (dat1[["buyer_aid", "lead_payout"]]
                   .groupby("buyer_aid")
                   .aggregate(np.mean)
                   .reset_index()
                   .sort_values("lead_payout", ascending=False))
    dat1_result.to_excel(writer, sheet_name='Average_Payout', index=False, header=True)
    
    # Sheet2 pivot: total lead_payout per buyer_aid
    dat2 = xl.parse("Sheet2")
    dat2_result = (dat2[["buyer_aid", "lead_payout"]]
                   .groupby("buyer_aid")
                   .aggregate(np.sum)
                   .reset_index()
                   .sort_values("lead_payout", ascending=False))
    dat2_result.to_excel(writer, sheet_name='Sum_Payout', index=False, header=True)
    
    # combine averages and sums on buyer_aid; without "on", merge would join on
    # both shared columns (including lead_payout) and return almost nothing
    dat3 = pd.merge(dat1_result, dat2_result, on="buyer_aid", how="inner", suffixes=("_avg", "_sum"))
    dat3.to_excel(writer, sheet_name='All_Results', index=False, header=True)
    
    # save the Excel file.    
    writer.save()

    
automate("LP_Data.xls", "New_Report.xlsx")  # the xlsxwriter engine writes .xlsx files
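
Since the post frames these summaries as pivot tables, it’s worth noting that the groupby chains above are equivalent to pandas’ pivot_table function. A minimal sketch of the Sheet1 summary in that style (same hypothetical buyer_aid/lead_payout columns):

# pivot-table formulation of the Sheet1 summary, equivalent to the groupby above
pivot1 = pd.pivot_table(dat1, index="buyer_aid", values="lead_payout",
                        aggfunc=np.mean).reset_index()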

A Few Days of Python: Using R in Python

Using R Functions in Python

import pandas as pd
import pyper as pr

def zone_func(zone_file):

    # IMPORT DATA:
    dat = pd.read_csv(zone_file, parse_dates=["Row Labels"])

    # CREATE AN R INSTANCE WITH PYPER:
    r = pr.R()
    
    # PASS DATA FROM PYTHON TO R:
    r.assign("rsfores", dat["forest"]) 
    
    # RUN R COMMANDS 
    forecast = r("""
            library(forecast)
            forecast_func <- function(vehicle){
                a_fit <- auto.arima(vehicle)
                a_forecast <- forecast(a_fit, h = 8)
                accuracy(a_forecast)
            }

            fores_output <- forecast_func(rsfores)
    """)

    # EXTRACT R OBJECT AS A PANDAS DATA FRAME
    pydata = pd.DataFrame(r.get("fores_output"))
    print(pydata)

zone_func("File_Atlanta.csv")
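
A nice property of the pyper approach is that any Python value can be pushed into the R session, not just the series itself. A minimal sketch reusing the same assign/run/get pattern as above (h_val and fores_df are hypothetical names; dat is the frame loaded in zone_func):

# sketch: choose the forecast horizon on the Python side and pass it into R
r = pr.R()
r.assign("rsfores", dat["forest"])
r.assign("h_val", 8)
r("""
    library(forecast)
    fit <- auto.arima(rsfores)
    fores_df <- as.data.frame(forecast(fit, h = h_val))
""")
horizon_df = pd.DataFrame(r.get("fores_df"))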

Homework during the hiring process…no thanks!

Not too long ago, I was on the job market looking for work as an applied statistician or data scientist within the online marketing industry. One thing I’ve come to expect with almost every company is some sort of homework assignment or challenge where a spreadsheet is presented along with some guidelines on what type of analysis they would like. Sometimes it’s very open ended, and at other times there are specific tasks and questions that are put forth. Initially, I saw these assignments as something fun where I could showcase my skill set. However, since last month, I’ve come to see them as a nuisance that can’t possibly be a good indicator of whether someone is ‘worth hiring’ or not. I get it: companies often get inundated with resumes and need effective processes to sift through them. And I get the value of a document that outlines how an applicant thought about a problem and generated some valuable insights.

With all that said, do we seriously think that homework assignments and challenges during the hiring process are the most effective way of finding the “best candidate” (whatever that means)? I don’t have any data to suggest either way, but I’m inclined to believe that companies and analytics hiring managers need to develop better ways of assessing the quality of candidates. If these assignments are really about assessing who is serious enough about a role to spend a few hours of their free time answering some ‘simple’ questions and putting together some basic lines of R or Python code, then so be it. But I think a better process can be put forth that allows companies to find the right candidate.

I’ve been part of the hiring process, and I’ve also gone through months of looking for employment. Based on my experiences on both sides of the table, here’s my view of what is most effective when looking for analytics professionals, applied statisticians, or data scientists. Ultimately, my feeling is that the only way to assess whether a candidate is worth hiring is by testing prospective candidates in a more formal manner. The key is to have the applicant complete this work during the interview, as that removes it from being characterized as a take-home homework assignment.

Part 1: Quantitative Skills
To assess a candidate’s quantitative proficiency, here are some techniques that have worked well in my experience.
a. Put together a document with an existing business problem and some of the analysis that has been done to answer it. Ask the applicant for suggestions on the limitations of the current approach and what they’d do if the project were handed to them.
b. Put together a basic statistics test covering simple probability theory and inferential statistical principles. Ask the candidate to answer those questions in an informal setting to ascertain what they know and how they work through problems when they don’t know the answer.
c. Ask the applicant to read a statistically demanding document and then request a summary plus feedback. This should tell us something about what the candidate knows about statistics and whether they can summarize the relevant parts in a satisfactory manner.

Part 2: Technical Skills
To assess a candidate’s technical proficiency, here are some techniques that have worked well in my experience.
a. Show the applicant some imperfect code that is unnecessarily long or could otherwise be improved. Ask them to look it over and suggest how they’d do things differently.
b. Put together several small code snippets in various programming languages that the candidate may or may not know. Ask them to go through the code, identify what is happening at each step, and explain the final result.
c. Have the applicant share an interesting work or non-work-related project they completed recently. They can talk about specific aspects of their code and consider whether there is anything they’d do differently now.

The possibilities are endless, but there have to be better ways to assess the quality of candidates for analytics roles than the ‘homework assignment.’ In any case, I’ll be refusing to do any more assignments as part of the hiring process.

PS: In time, I finally opened up to the notion of ‘technical assignments.’ Don’t get me wrong, I haven’t used them and never will when I’m in a hiring capacity, but I finally accepted that I’d have to do a few homework assignments here and there.

PPS: Oddly enough, I’ve received a few rough/rude emails from hiring managers regarding this post. If you don’t like my perspective, feel free to write your own blog post about it. Personally, I don’t see the need for someone to send me a mouthy email just because they have a different perspective.

Turning Data Into Awesome With sqldf and pandasql

Both R and Python have libraries for using SQL statements to interact with data frames. While both languages possess native facilities for manipulating data, the sqldf and pandasql libraries provide a simple, elegant interface for these tasks using a framework that’s intuitive and widely used by analysts.

[Screenshots of the example data frames df1 and df2 omitted.]
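
In place of the screenshots, here is a hypothetical pair of data frames with the column names the queries below rely on. The TX rows mirror the original output; the rest are made up, so aggregate counts will differ from those shown:

import pandas as pd

# hypothetical stand-in data; column names match the queries below
df1 = pd.DataFrame({
    "personid": [1, 2, 3, 4],
    "var1": [-2.09, 1.16, 0.47, -0.31],
})
df2 = pd.DataFrame({
    "id": [1, 2, 3, 4],
    "firstname": ["David", "Joe", "Jane", "Mary"],
    "lastname": ["Spade", "Montana", "Doe", "Smith"],
    "state": ["TX", "TX", "CA", "CA"],
})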

R and sqldf

sqldf("SELECT COUNT(*) FROM df2 WHERE state = 'CA'")
 
  COUNT(*)
1        4
 
sqldf("SELECT 
            df2.firstname, 
            df2.lastname, 
            df1.var1, 
            df2.state 
      FROM df1
      INNER JOIN df2 ON df1.personid = df2.id 
      WHERE df2.state = 'TX'")
 
  firstname lastname  var1 state
1     David    Spade -2.09    TX
2       Joe  Montana  1.16    TX
 
sqldf("SELECT 
            df2.state, 
            COUNT(df1.var1) 
      FROM df1
      INNER JOIN df2 ON df1.personid = df2.id WHERE df1.var1 > 0
      GROUP BY df2.state")
 
   state COUNT(df1.var1)
1     AZ               1
2     CA               1
3     GA               1
4     IL               1
5     NC               1
6     NY               1
7     OK               1
8     SC               1
9     TX               1
10    VT               1

Python and pandasql

import pandasql as ps
 
 
q1 = """SELECT COUNT(*) FROM df2 WHERE state = 'CA'"""
 
print(ps.sqldf(q1, locals()))
 
   COUNT(*)
0         4
 
q2 = """
    SELECT 
        df2.firstname, 
        df2.lastname, 
        df1.var1, 
        df2.state 
    FROM df1 INNER JOIN df2 ON df1.personid = df2.id 
    WHERE df2.state = 'TX';
    """
 
print(ps.sqldf(q2, locals()))
 
  firstname lastname  var1 state
0     David    Spade -2.09    TX
1       Joe  Montana  1.16    TX
 
q3 = """SELECT 
            df2.state, 
            COUNT(df1.var1) 
      FROM df1
      INNER JOIN df2 ON df1.personid = df2.id WHERE df1.var1 > 0
      GROUP BY df2.state"""
 
print(ps.sqldf(q3, locals()))
 
  state  COUNT(df1.var1)
0    AZ                1
1    CA                1
2    GA                1
3    IL                1
4    NC                1
5    NY                1
6    OK                1
7    SC                1
8    TX                1
9    VT                1
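
For comparison, the last query can also be expressed with pandas’ native facilities. A minimal sketch against the same hypothetical frames:

# pandas equivalent of q3: keep positive var1, join on person id, count by state
merged = df1[df1["var1"] > 0].merge(df2, left_on="personid", right_on="id")
print(merged.groupby("state")["var1"].count().reset_index())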