fetch_comment #126

Open
wants to merge 22 commits into master
3 changes: 2 additions & 1 deletion .gitignore
@@ -23,4 +23,5 @@ node_modules/
 
 secret.py
 inscrawler/bin/*
-!inscrawler/bin/.keep
+!inscrawler/bin/.keep
+view.ipynb
38 changes: 38 additions & 0 deletions consumer.py
@@ -0,0 +1,38 @@
import json
import logging
from kafka import KafkaConsumer

def forgiving_json_deserializer(v):
    # Decode a JSON message value; return None instead of raising on bad input.
    if v is None:
        return None
    try:
        return json.loads(v.decode('utf-8'))
    except json.decoder.JSONDecodeError:
        logging.exception('Unable to decode: %s', v)
        return None

# topic, broker list
consumer = KafkaConsumer(
    'test',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',
    # (1) consume from the tail of the topic instead:
    # auto_offset_reset='latest'
    # (2) or subscribe to a new topic:
    # consumer.subscribe(['offering_new_too'])
    enable_auto_commit=True,
    group_id='1',
    # value_deserializer=lambda x: json.loads(x.decode('utf-8')),
    value_deserializer=forgiving_json_deserializer,
    consumer_timeout_ms=1000
)

# # consume json messages
# KafkaConsumer(value_deserializer=lambda m: json.loads(m.decode('ascii')))

# read the messages currently on the topic
print('[begin] get consumer list')
for message in consumer:
    print("Topic: %s, Partition: %d, Offset: %d, Key: %s, Value: %s" % (
        message.topic, message.partition, message.offset, message.key, message.value
    ))
print('[end] get consumer list')
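
For reference, a minimal kafka-python producer sketch (not part of this diff) showing how crawl results could be published as JSON to the same 'test' topic the consumer reads; the payload fields below are illustrative, not the crawler's actual schema.

import json
from kafka import KafkaProducer

# Sketch only: publish a JSON-serializable dict to the 'test' topic.
# The example payload fields are hypothetical.
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
)
producer.send('test', {'key': 'https://www.instagram.com/p/EXAMPLE/', 'caption': 'example caption'})
producer.flush()
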
1,133 changes: 1,133 additions & 0 deletions crawl.json

Large diffs are not rendered by default.

74 changes: 74 additions & 0 deletions data_preprocess.py
@@ -0,0 +1,74 @@
import json
import os

path_dir = './output'
file_list = os.listdir(path_dir)

# Split each crawl file into the first nine posts ("hot") and the rest ("recent").
# hot_data and data keep accumulating across every file processed so far.
hot_data = []
data = []
for jsonfile in file_list:
    prefix = jsonfile[:2]
    if prefix in ('p1', 'p2', 'p3', 'p4', 'p5', 'p6'):
        with open(os.path.join(path_dir, jsonfile), 'r') as file:
            data1 = json.load(file)
        hot_data += data1[:9]
        data += data1[9:]
        # write out new files
        with open(os.path.join(path_dir, prefix + "_hot.json"), "a") as new_file:
            json.dump(hot_data, new_file, ensure_ascii=False, indent=4)
        with open(os.path.join(path_dir, prefix + "_recent.json"), "a") as new_file:
            json.dump(data, new_file, ensure_ascii=False, indent=4)
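
A quick sanity-check sketch (not part of the diff), assuming each prefix's output was written exactly once, to confirm the hot/recent split:

import json

# Sketch only: verify the split for the hypothetical p1 outputs.
with open('./output/p1_hot.json') as f:
    hot = json.load(f)
with open('./output/p1_recent.json') as f:
    recent = json.load(f)
print(len(hot), 'hot posts,', len(recent), 'recent posts')
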
5 changes: 3 additions & 2 deletions inscrawler/browser.py
@@ -18,8 +18,8 @@ def __init__(self, has_screen):
         dir_path = os.path.dirname(os.path.realpath(__file__))
         service_args = ["--ignore-ssl-errors=true"]
         chrome_options = Options()
-        if not has_screen:
-            chrome_options.add_argument("--headless")
+        # if not has_screen:
+        #     chrome_options.add_argument("--headless")
         chrome_options.add_argument("--start-maximized")
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("user-agent="+UserAgent().random)
@@ -28,6 +28,7 @@ def __init__(self, has_screen):
             service_args=service_args,
             chrome_options=chrome_options,
         )
+        self.driver.maximize_window()
         self.driver.implicitly_wait(5)
 
     @property
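
Commenting out the headless branch makes every run open a visible browser window. If screenless runs are still wanted (for example on a server), a sketch of keeping the has_screen switch instead, under the assumption that a newer Chrome build is in use:

# Sketch only, not part of the PR: keep headless as an opt-in path.
if not has_screen:
    chrome_options.add_argument("--headless=new")  # newer Chrome headless mode
    chrome_options.add_argument("--window-size=1920,1080")  # maximize_window() is unreliable when headless
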
19 changes: 14 additions & 5 deletions inscrawler/crawler.py
@@ -190,7 +190,7 @@ def check_next_post(cur_key):
         browser = self.browser
         browser.implicitly_wait(1)
         browser.scroll_down()
-        ele_post = browser.find_one(".v1Nh3 a")
+        ele_post = browser.find_one("._aao7 a")
         ele_post.click()
         dict_posts = {}
 
@@ -268,16 +268,25 @@ def _get_posts(self, num):
         pbar = tqdm(total=num)
 
         def start_fetching(pre_post_num, wait_time):
-            ele_posts = browser.find(".v1Nh3 a")
+            ele_posts = browser.find("._aabd a")
+            # print(ele_posts)
             for ele in ele_posts:
                 key = ele.get_attribute("href")
+                # print(key)
                 if key not in key_set:
                     dict_post = { "key": key }
-                    ele_img = browser.find_one(".KL4Bh img", ele)
+                    ele_img = browser.find_one("div._aagv img", ele)
                     dict_post["caption"] = ele_img.get_attribute("alt")
+                    # print(dict_post["caption"])
                     dict_post["img_url"] = ele_img.get_attribute("src")
 
-                    fetch_details(browser, dict_post)
+                    # print(dict_post["img_url"])
+                    fetch_details(browser, dict_post)  # includes the description
+                    fetch_datetime(browser, dict_post)
+                    # fetch_imgs(browser, dict_post)
+                    fetch_likes_plays(browser, dict_post)
+                    # fetch_likers(browser, dict_post)
+                    # fetch_caption(browser, dict_post)
+                    fetch_comments(browser, dict_post)
 
                     key_set.add(key)
                     posts.append(dict_post)
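
The selectors swapped in here (._aao7, ._aabd, div._aagv in place of .v1Nh3 and .KL4Bh) are Instagram's obfuscated CSS class names, which tend to change with UI updates. A defensive sketch, assuming browser.find_one returns None when nothing matches; the fallback selector below is a placeholder, not a real class:

# Sketch only, not part of the PR: try selectors in order so a renamed
# class degrades gracefully instead of crashing the crawl.
def find_post_image(browser, ele):
    for selector in ("div._aagv img", "div._hypothetical img"):
        ele_img = browser.find_one(selector, ele)
        if ele_img is not None:
            return ele_img
    return None
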