Hacker News

Contents

Hacker News¶

Resources for using the Hacker News API

About¶

Key	Value
Source	Hacker News API: Documentation and Samples for the Official HN API
Website	Y Combinator - Hacker News

Import Libraries¶

External Libraries¶

from urllib.request import urlopen
import json
import pandas as pd

Get Stories¶

def get_stories(
    category: str = 'topstories',
    number: int = 5,
    include_kids: bool = False
    ) -> pd.DataFrame:
    """Return a DataFrame of Hacker News stories.

    Args:
        category (str): story-list endpoint name, e.g.
            topstories|newstories|beststories
        number (int): maximum number of stories to return. Zero returns an
            empty table without making any request.
        include_kids (bool, optional): keep the ``kids`` column and emit one
            row per comment id listed in it. Defaults to False, which drops
            the column and returns one row per story.

    Returns:
        pd.DataFrame: table of stories. The columns listed in
        ``column_names`` below always come first (minus ``kids`` unless
        requested); any additional fields present on the fetched items
        follow them.

    Raises:
        ValueError: if ``number`` is negative.
    """
    if number < 0:
        # A negative slice bound would silently mean "all but the last N"
        # and trigger a request for nearly every id in the list.
        raise ValueError(f'number must be non-negative, got {number}')

    base_url = 'https://hacker-news.firebaseio.com/v0/'
    column_names = [
        'by',
        'descendants',
        'id',
        'kids',
        'score',
        'time',
        'title',
        'type',
        'url'
    ]

    # Collect one dict per story and build the frame once at the end,
    # rather than concatenating a growing DataFrame on every iteration.
    records = []
    if number > 0:
        story_ids = _fetch_json(f'{base_url}{category}.json') or []
        for story_id in story_ids[:number]:
            story = _fetch_json(f'{base_url}item/{story_id}.json')
            # An item whose payload is JSON null decodes to None; skip it.
            if story is not None:
                records.append(story)

    stories_df = pd.DataFrame(records)
    # Put the standard columns first (creating any that are missing) and
    # keep whatever extra fields the fetched items carried after them.
    extra_columns = [
        name for name in stories_df.columns if name not in column_names
    ]
    stories_df = stories_df.reindex(columns=column_names + extra_columns)

    if not include_kids:
        return stories_df.drop(columns='kids')
    if stories_df.empty:
        return stories_df
    # One row per comment id; a story without kids keeps a single row
    # with a missing value in the kids column.
    return stories_df.explode('kids').reset_index(drop=True)


def _fetch_json(url: str):
    """Download ``url`` and return its decoded JSON payload."""
    # The context manager closes the HTTP response even if decoding fails.
    with urlopen(url) as response:
        return json.loads(response.read().decode('utf-8'))

# Show three stories from the topstories list, without the kids column
get_stories(category='topstories', number=3, include_kids=False)

	by	descendents	id	score	time	title	type	url	descendants
0	cpeterso	NaN	31523019	39	1653597069	Proton Is Trying to Become Google–Without Your...	story	https://www.wired.com/story/proton-mail-calend...	20.0
1	sgbeal	NaN	31518618	498	1653574719	SQLite 3 Fiddle	story	https://sqlite.org/fiddle/	83.0
2	hwayne	NaN	31520483	222	1653583255	Which dinosaurs lived in your hometown?	story	https://dinosaurpictures.org/ancient-earth#260	67.0

# Keep the result in a variable so it can be inspected in the next steps
three_stories = get_stories(category='topstories', number=3, include_kids=False)

# Check the type of the object returned by get_stories
type(three_stories)

pandas.core.frame.DataFrame

# Pull the title column out as a plain Python list
three_stories['title'].to_list()

['Proton Is Trying to Become Google–Without Your Data',
 'SQLite 3 Fiddle',
 'Which dinosaurs lived in your hometown?']

previous

Google Cloud Platform (GCP)

next

YouTube