Create functions for web scraping

I built a web scraper that pulls job listings from Facebook's careers site, and I want to break the code into functions I can reuse for other job boards. The script works as-is, but I think it could be cleaner and more reusable if it were broken into functions; I'm stuck on how to structure them. It currently pulls only two pages, for testing. A rough sketch of the kind of split I have in mind follows the code.



from time import time, sleep
from random import randint
from requests import get
from IPython.core.display import clear_output
from warnings import warn
from bs4 import BeautifulSoup
import csv

# Range of only 2 pages, for testing
pages = [str(i) for i in range(1, 3)]
cities = ["Menlo%20Park%2C%20CA",
          "Fremont%2C%20CA",
          "Los%20Angeles%2C%20CA",
          "Mountain%20View%2C%20CA",
          "Northridge%2C%20CA",
          "Redmond%2C%20WA",
          "San%20Francisco%2C%20CA",
          "Santa%20Clara%2C%20CA",
          "Seattle%2C%20WA",
          "Woodland%20Hills%2C%20CA"]

# Prepare the monitoring of the loop
start_time = time()
requests = 0

# Write the CSV header once
with open('facebook_job_list.csv', 'w', newline='') as f:
    header = csv.writer(f)
    header.writerow(["Website", "Title", "Location", "Job URL"])

for page in pages:
    for c in cities:
        # Request the HTML page
        response = get("https://www.facebook.com/careers/jobs/?page=" + page +
                       "&results_per_page=100&locations[0]=" + c)

        # Pause the loop for between 8 and 15 seconds
        sleep(randint(8, 15))

        # Monitor the frequency of requests
        requests += 1
        elapsed_time = time() - start_time
        print("Request: {}; Frequency: {} requests/s".format(requests, requests / elapsed_time))
        clear_output(wait=True)

        # Warn on non-200 status codes
        if response.status_code != 200:
            warn("Request: {}; Status code: {}".format(requests, response.status_code))

        # Break the loop if the number of requests is greater than expected
        if requests > 2:
            warn("Number of requests was greater than expected.")
            break

        # Parse the content of the request with BeautifulSoup
        page_soup = BeautifulSoup(response.text, 'html.parser')
        job_containers = page_soup.find_all("a", "_69jm")

        # Extract the fields from each job container on the page
        for container in job_containers:
            site = page_soup.find("title").text
            title = container.find("div", "_69jo").text
            location = container.find("div", "_1n-z _6hy- _21-h").text
            link = container.get("href")
            job_link = "https://www.facebook.com" + link

            # Append one row per job
            with open('facebook_job_list.csv', 'a', newline='') as f:
                rows = csv.writer(f)
                rows.writerow([site, title, location, job_link])
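
One thing I already suspect: the percent-encoded city strings don't have to be maintained by hand, since urllib.parse.quote from the standard library produces the same encoding from plain names. A minimal sketch (city_names is an illustrative name, not from the script above):

from urllib.parse import quote

# quote() percent-encodes the spaces and commas:
# "Menlo Park, CA" -> "Menlo%20Park%2C%20CA"
city_names = ["Menlo Park, CA", "Fremont, CA", "Northridge, CA", "Seattle, WA"]
cities = [quote(name) for name in city_names]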

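Here is the kind of split I have in mind, sketched under the assumption that the URL scheme and CSS class names stay exactly as above; build_url, fetch_page, parse_jobs and scrape are illustrative names, not an established API:

from time import sleep
from random import randint
from urllib.parse import quote
from warnings import warn
import csv

from requests import get
from bs4 import BeautifulSoup

def build_url(page, city):
    # Everything site-specific about the URL lives in one place
    return ("https://www.facebook.com/careers/jobs/?page={}"
            "&results_per_page=100&locations[0]={}".format(page, quote(city)))

def fetch_page(url, delay=(8, 15)):
    # Fetch one page, pausing a random interval to stay polite
    response = get(url)
    sleep(randint(*delay))
    if response.status_code != 200:
        warn("Status code {} for {}".format(response.status_code, url))
    return response.text

def parse_jobs(html):
    # Yield one (site, title, location, job_link) tuple per job container
    soup = BeautifulSoup(html, "html.parser")
    site = soup.find("title").text
    for container in soup.find_all("a", "_69jm"):
        title = container.find("div", "_69jo").text
        location = container.find("div", "_1n-z _6hy- _21-h").text
        job_link = "https://www.facebook.com" + container.get("href")
        yield site, title, location, job_link

def scrape(pages, cities, out_path):
    # Open the CSV once and stream rows into it as pages are parsed
    with open(out_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Website", "Title", "Location", "Job URL"])
        for page in pages:
            for city in cities:
                for row in parse_jobs(fetch_page(build_url(page, city))):
                    writer.writerow(row)

if __name__ == "__main__":
    scrape(pages=range(1, 3),
           cities=["Menlo Park, CA", "Fremont, CA", "Seattle, WA"],
           out_path="facebook_job_list.csv")

The idea is that the site-specific details (URL scheme, CSS classes) would live only in build_url and parse_jobs, so supporting another job board would mean swapping just those two functions while the fetching, throttling and CSV writing stay shared. Is this a reasonable way to structure it?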








Tags: python, functional-programming, web-scraping





