From 41f9ef3d8668c70c030980f1468ddaddfc3238e3 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 30 Dec 2020 15:08:40 +0300 Subject: [PATCH 1/4] Headless version issue is fixed --- inscrawler/browser.py | 3 +++ output | 1 + 2 files changed, 4 insertions(+) create mode 100644 output diff --git a/inscrawler/browser.py b/inscrawler/browser.py index e139b03..4bb7871 100644 --- a/inscrawler/browser.py +++ b/inscrawler/browser.py @@ -15,8 +15,11 @@ class Browser: def __init__(self, has_screen): dir_path = os.path.dirname(os.path.realpath(__file__)) + useragent = "Mozilla/5.0 (X11; Linux i686; rv:77.0) Gecko/20100101 Firefox/77.0" + service_args = ["--ignore-ssl-errors=true"] chrome_options = Options() + chrome_options.add_argument(f'--user-agent={useragent}') if not has_screen: chrome_options.add_argument("--headless") chrome_options.add_argument("--start-maximized") diff --git a/output b/output new file mode 100644 index 0000000..a7d4a2c --- /dev/null +++ b/output @@ -0,0 +1 @@ +{"name": "Cal Foodie", "desc": "🍴Food explorer🍴\n🤓California based foodie😋\n🇺🇸San Francisco 🍩 Bay Area🍭\n🍖 灣區走一回,吃的都在這🍖\nFB👉facebook.com/calfoodie\nBlog👉medium.com/calfoodie", "photo_url": "https://instagram.fbtz1-9.fna.fbcdn.net/v/t51.2885-19/s150x150/53800836_306482636695676_7044132336971022336_n.jpg?_nc_ht=instagram.fbtz1-9.fna.fbcdn.net&_nc_ohc=adcijP0gVJkAX9HZjQ_&tp=1&oh=ff1ca2d45d977a55c83176fe71428340&oe=60143CE3", "post_num": "332", "follower_num": "669", "following_num": "373"} \ No newline at end of file From 93cb9d7fe6356b98f19b277cee6fe0b2c24cc7b4 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 30 Dec 2020 15:09:58 +0300 Subject: [PATCH 2/4] Headless version issue is fixed --- output | 1 - 1 file changed, 1 deletion(-) delete mode 100644 output diff --git a/output b/output deleted file mode 100644 index a7d4a2c..0000000 --- a/output +++ /dev/null @@ -1 +0,0 @@ -{"name": "Cal Foodie", "desc": "🍴Food explorer🍴\n🤓California based foodie😋\n🇺🇸San Francisco 🍩 Bay Area🍭\n🍖 灣區走一回,吃的都在這🍖\nFB👉facebook.com/calfoodie\nBlog👉medium.com/calfoodie", "photo_url": "https://instagram.fbtz1-9.fna.fbcdn.net/v/t51.2885-19/s150x150/53800836_306482636695676_7044132336971022336_n.jpg?_nc_ht=instagram.fbtz1-9.fna.fbcdn.net&_nc_ohc=adcijP0gVJkAX9HZjQ_&tp=1&oh=ff1ca2d45d977a55c83176fe71428340&oe=60143CE3", "post_num": "332", "follower_num": "669", "following_num": "373"} \ No newline at end of file From 6aea3603336fec96e8079efcf343f8e40b593f5b Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 30 Dec 2020 15:41:26 +0300 Subject: [PATCH 3/4] chromedriver update --- .gitignore | 3 ++- inscrawler/bin/.keep | 0 inscrawler/browser.py | 4 ++-- requirements.txt | 3 ++- 4 files changed, 6 insertions(+), 4 deletions(-) delete mode 100644 inscrawler/bin/.keep diff --git a/.gitignore b/.gitignore index fbf80db..2760a16 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,5 @@ node_modules/ secret.py inscrawler/bin/* -!inscrawler/bin/.keep \ No newline at end of file +!inscrawler/bin/.keep +output \ No newline at end of file diff --git a/inscrawler/bin/.keep b/inscrawler/bin/.keep deleted file mode 100644 index e69de29..0000000 diff --git a/inscrawler/browser.py b/inscrawler/browser.py index 4bb7871..f62e7f8 100644 --- a/inscrawler/browser.py +++ b/inscrawler/browser.py @@ -8,13 +8,13 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.keys import Keys +from webdriver_manager.chrome import ChromeDriverManager from .utils import randmized_sleep class Browser: def __init__(self, has_screen): - dir_path = os.path.dirname(os.path.realpath(__file__)) useragent = "Mozilla/5.0 (X11; Linux i686; rv:77.0) Gecko/20100101 Firefox/77.0" service_args = ["--ignore-ssl-errors=true"] @@ -25,7 +25,7 @@ def __init__(self, has_screen): chrome_options.add_argument("--start-maximized") chrome_options.add_argument("--no-sandbox") self.driver = webdriver.Chrome( - executable_path="%s/bin/chromedriver" % dir_path, + ChromeDriverManager().install(), service_args=service_args, chrome_options=chrome_options, ) diff --git a/requirements.txt b/requirements.txt index 0ce9a2f..9bf3963 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ future==0.16.0 selenium==3.9.0 tqdm==4.23.4 pre-commit==1.16.1 -black==19.3b0 \ No newline at end of file +black==19.3b0 +webdriver-manager==3.2.2 \ No newline at end of file From c99dd41edd33a32298dc47c743d1bf923f5b1b67 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 30 Dec 2020 15:42:38 +0300 Subject: [PATCH 4/4] chromedriver update --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e439a84..0cac2cc 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,8 @@ This crawler could fail due to updates on instagram’s website. If you encounte ## Install 1. Make sure you have Chrome browser installed. -2. Download [chromedriver](https://sites.google.com/a/chromium.org/chromedriver/) and put it into bin folder: `./inscrawler/bin/chromedriver` -3. Install Selenium: `pip3 install -r requirements.txt` -4. `cp inscrawler/secret.py.dist inscrawler/secret.py` +2. Install Selenium: `pip3 install -r requirements.txt` +3. `cp inscrawler/secret.py.dist inscrawler/secret.py` ## User Auth 1. Open `inscrawler/secret.py` file.