Hacker News

Resources for using the Hacker News API

Import Libraries

External Libraries

from urllib.request import urlopen
import json
import pandas as pd

Get Stories

def _fetch_json(url: str):
    """Download *url* and return its decoded JSON payload."""
    # The context manager closes the connection even if decoding fails.
    with urlopen(url) as response:
        return json.loads(response.read().decode('utf-8'))


def get_stories(
    category: str = 'topstories',
    number: int = 5,
    include_kids: bool = False,
    *,
    base_url: str = 'https://hacker-news.firebaseio.com/v0/'
    ) -> pd.DataFrame:
    """Return a DataFrame of Hacker News Stories

    Args:
        category (str): name of the story list endpoint, e.g.
            topstories|newstories|beststories
        number (int): total number of stories to return
        include_kids (bool, optional): keep the ``kids`` (comment id)
            column, normalized to one row per comment id. Stories without
            comments keep a single row with a missing ``kids`` value.
            Defaults to False.
        base_url (str, optional): root of the API. Any URL scheme that
            ``urlopen`` understands works, which allows pointing the
            function at a mirror or at local ``file://`` fixtures.

    Returns:
        pd.DataFrame: table of stories with a fresh 0..N-1 index. The
        standard story fields come first; any additional fields returned
        by the API are appended after them.

    Raises:
        ValueError: if the category endpoint does not return a JSON list.
    """
    if not base_url.endswith('/'):
        base_url += '/'

    # Get list of story ids for the requested category
    story_ids = _fetch_json(base_url + category + '.json')
    if not isinstance(story_ids, list):
        raise ValueError(
            'Expected a list of story ids for category %r, got %r'
            % (category, type(story_ids).__name__)
        )

    # Field names as spelled by the API ('descendants', not 'descendents')
    column_names = [
        'by',
        'descendants',
        'id',
        'kids',
        'score',
        'time',
        'title',
        'type',
        'url'
    ]

    # Retrieve json data for each story
    item_base_url = base_url + 'item/'
    records = []
    for story_id in story_ids[:number]:
        item = _fetch_json(item_base_url + str(story_id) + '.json')
        # A JSON null payload (nothing to tabulate) decodes to None; skip it.
        if item:
            records.append(item)

    # Build the frame once instead of concatenating inside the loop; keep
    # the standard columns first and preserve any extra fields after them.
    stories_df = pd.DataFrame(records)
    extra_columns = [
        name for name in stories_df.columns if name not in column_names
    ]
    stories_df = stories_df.reindex(columns=column_names + extra_columns)

    if not include_kids:
        # Remove comment ids: one row per story
        return stories_df.drop(columns='kids')

    # One row per comment id; stories without comments keep a single row.
    return stories_df.explode('kids').reset_index(drop=True)
# Example: request three stories from the 'topstories' list, without the kids column
get_stories('topstories', 3, False)
by descendents id score time title type url descendants
0 cpeterso NaN 31523019 39 1653597069 Proton Is Trying to Become Google–Without Your... story https://www.wired.com/story/proton-mail-calend... 20.0
1 sgbeal NaN 31518618 498 1653574719 SQLite 3 Fiddle story https://sqlite.org/fiddle/ 83.0
2 hwayne NaN 31520483 222 1653583255 Which dinosaurs lived in your hometown? story https://dinosaurpictures.org/ancient-earth#260 67.0
# Keep the result in a variable and check the type of the returned object
three_stories = get_stories('topstories', 3, False)
type(three_stories)
pandas.core.frame.DataFrame
# Pull the 'title' column out as a plain Python list
three_stories['title'].to_list()
['Proton Is Trying to Become Google–Without Your Data',
 'SQLite 3 Fiddle',
 'Which dinosaurs lived in your hometown?']