fetch_comment #126

Open
wants to merge 22 commits into master
3 changes: 2 additions & 1 deletion .gitignore
@@ -23,4 +23,5 @@ node_modules/
 
 secret.py
 inscrawler/bin/*
-!inscrawler/bin/.keep
+!inscrawler/bin/.keep
+view.ipynb
38 changes: 38 additions & 0 deletions consumer.py
@@ -0,0 +1,38 @@
import json
import logging
from kafka import KafkaConsumer

def forgiving_json_deserializer(v):
    # Decode a JSON message value; return None instead of raising on bad input.
    if v is None:
        return None
    try:
        return json.loads(v.decode('utf-8'))
    except json.decoder.JSONDecodeError:
        logging.exception('Unable to decode: %s', v)
        return None

# topic, broker list
consumer = KafkaConsumer(
    'test',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',
    # (1) consume from the tail of the topic instead:
    # auto_offset_reset='latest'
    # (2) or subscribe to a new topic:
    # consumer.subscribe(['offering_new_too'])
    enable_auto_commit=True,
    group_id='1',
    # value_deserializer=lambda x: json.loads(x.decode('utf-8')),
    value_deserializer=forgiving_json_deserializer,
    consumer_timeout_ms=1000
)

# # consume json messages
# KafkaConsumer(value_deserializer=lambda m: json.loads(m.decode('ascii')))

# read the messages currently on the topic
print('[begin] get consumer list')
for message in consumer:
    print("Topic: %s, Partition: %d, Offset: %d, Key: %s, Value: %s" % (
        message.topic, message.partition, message.offset, message.key, message.value
    ))
print('[end] get consumer list')
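
For reference, a minimal kafka-python producer sketch (not part of this diff) showing how crawl results could be published as JSON to the same 'test' topic the consumer reads; the payload fields below are illustrative, not the crawler's actual schema.

import json
from kafka import KafkaProducer

# Sketch only: publish a JSON-serializable dict to the 'test' topic.
# The example payload fields are hypothetical.
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
)
producer.send('test', {'key': 'https://www.instagram.com/p/EXAMPLE/', 'caption': 'example caption'})
producer.flush()
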
1,133 changes: 1,133 additions & 0 deletions crawl.json

Large diffs are not rendered by default.

74 changes: 74 additions & 0 deletions data_preprocess.py
@@ -0,0 +1,74 @@
import json
import os

path_dir = './output'
file_list = os.listdir(path_dir)

# Split each crawl file into the first nine posts ("hot") and the rest ("recent").
# hot_data and data keep accumulating across every file processed so far.
hot_data = []
data = []
for jsonfile in file_list:
    prefix = jsonfile[:2]
    if prefix in ('p1', 'p2', 'p3', 'p4', 'p5', 'p6'):
        with open(os.path.join(path_dir, jsonfile), 'r') as file:
            data1 = json.load(file)
        hot_data += data1[:9]
        data += data1[9:]
        # write out new files
        with open(os.path.join(path_dir, prefix + "_hot.json"), "a") as new_file:
            json.dump(hot_data, new_file, ensure_ascii=False, indent=4)
        with open(os.path.join(path_dir, prefix + "_recent.json"), "a") as new_file:
            json.dump(data, new_file, ensure_ascii=False, indent=4)
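
A quick sanity-check sketch (not part of the diff), assuming each prefix's output was written exactly once, to confirm the hot/recent split:

import json

# Sketch only: verify the split for the hypothetical p1 outputs.
with open('./output/p1_hot.json') as f:
    hot = json.load(f)
with open('./output/p1_recent.json') as f:
    recent = json.load(f)
print(len(hot), 'hot posts,', len(recent), 'recent posts')
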
5 changes: 3 additions & 2 deletions inscrawler/browser.py
@@ -18,8 +18,8 @@ def __init__(self, has_screen):
         dir_path = os.path.dirname(os.path.realpath(__file__))
         service_args = ["--ignore-ssl-errors=true"]
         chrome_options = Options()
-        if not has_screen:
-            chrome_options.add_argument("--headless")
+        # if not has_screen:
+        #     chrome_options.add_argument("--headless")
         chrome_options.add_argument("--start-maximized")
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("user-agent="+UserAgent().random)
@@ -28,6 +28,7 @@ def __init__(self, has_screen):
             service_args=service_args,
             chrome_options=chrome_options,
         )
+        self.driver.maximize_window()
         self.driver.implicitly_wait(5)
 
     @property
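
Commenting out the headless branch makes every run open a visible browser window. If screenless runs are still wanted (for example on a server), a sketch of keeping the has_screen switch instead, under the assumption that a newer Chrome build is in use:

# Sketch only, not part of the PR: keep headless as an opt-in path.
if not has_screen:
    chrome_options.add_argument("--headless=new")  # newer Chrome headless mode
    chrome_options.add_argument("--window-size=1920,1080")  # maximize_window() is unreliable when headless
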
19 changes: 14 additions & 5 deletions inscrawler/crawler.py
@@ -190,7 +190,7 @@ def check_next_post(cur_key):
         browser = self.browser
         browser.implicitly_wait(1)
         browser.scroll_down()
-        ele_post = browser.find_one(".v1Nh3 a")
+        ele_post = browser.find_one("._aao7 a")
         ele_post.click()
         dict_posts = {}
 
@@ -268,16 +268,25 @@ def _get_posts(self, num):
         pbar = tqdm(total=num)
 
         def start_fetching(pre_post_num, wait_time):
-            ele_posts = browser.find(".v1Nh3 a")
+            ele_posts = browser.find("._aabd a")
+            # print(ele_posts)
             for ele in ele_posts:
                 key = ele.get_attribute("href")
+                # print(key)
                 if key not in key_set:
                     dict_post = { "key": key }
-                    ele_img = browser.find_one(".KL4Bh img", ele)
+                    ele_img = browser.find_one("div._aagv img", ele)
                     dict_post["caption"] = ele_img.get_attribute("alt")
+                    # print(dict_post["caption"])
                     dict_post["img_url"] = ele_img.get_attribute("src")
 
-                    fetch_details(browser, dict_post)
+                    # print(dict_post["img_url"])
+                    fetch_details(browser, dict_post)  # includes the description
+                    fetch_datetime(browser, dict_post)
+                    # fetch_imgs(browser, dict_post)
+                    fetch_likes_plays(browser, dict_post)
+                    # fetch_likers(browser, dict_post)
+                    # fetch_caption(browser, dict_post)
+                    fetch_comments(browser, dict_post)
 
                     key_set.add(key)
                     posts.append(dict_post)
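
The selectors swapped in here (._aao7, ._aabd, div._aagv in place of .v1Nh3 and .KL4Bh) are Instagram's obfuscated CSS class names, which tend to change with UI updates. A defensive sketch, assuming browser.find_one returns None when nothing matches; the fallback selector below is a placeholder, not a real class:

# Sketch only, not part of the PR: try selectors in order so a renamed
# class degrades gracefully instead of crashing the crawl.
def find_post_image(browser, ele):
    for selector in ("div._aagv img", "div._hypothetical img"):
        ele_img = browser.find_one(selector, ele)
        if ele_img is not None:
            return ele_img
    return None
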