Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Beautify Home Page & add RSS & refactor scraper #2

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions News-Aggregator/NewsAggregator/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,4 @@
# https://docs.djangoproject.com/en/3.0/howto/static-files/

# URL prefix under which static files are served.
STATIC_URL = '/static/'
# Extra directory searched for static files: "<BASE_DIR>/static".
# NOTE(review): the path is built by string concatenation, so this only works
# while BASE_DIR is a str (a pathlib.Path would raise TypeError) -- confirm.
STATICFILES_DIRS = [BASE_DIR +'/'+ 'static']
Binary file modified News-Aggregator/db.sqlite3
Binary file not shown.
Binary file not shown.
Binary file removed News-Aggregator/news/__pycache__/admin.cpython-38.pyc
Binary file not shown.
Binary file removed News-Aggregator/news/__pycache__/models.cpython-38.pyc
Binary file not shown.
Binary file removed News-Aggregator/news/__pycache__/urls.cpython-38.pyc
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions News-Aggregator/news/migrations/0001_initial.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class Migration(migrations.Migration):
('title', models.CharField(max_length=200)),
('image', models.URLField(blank=True, null=True)),
('url', models.TextField()),
('date', models.TextField()),
],
),
]
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions News-Aggregator/news/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ class Headline(models.Model):
# Headline text, limited to 200 characters.
title = models.CharField(max_length=200)
# Optional image URL; may be NULL in the database and blank in forms.
image = models.URLField(null=True, blank=True)
# Link to the article, stored as free text.
url = models.TextField()
# Publication date stored as free text, not as a date/datetime column.
# NOTE(review): the views in this change write either a
# '%Y-%m-%d %H:%M:%S' string or the placeholder "undefined date" here.
date = models.TextField()
def __str__(self):
    """Return the headline's title as its string representation."""
    return self.title
Binary file not shown.
40 changes: 40 additions & 0 deletions News-Aggregator/news/static/news/static/main.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/* Shared vertical padding value reused by the jumbotron rules below. */
:root {
--jumbotron-padding-y: 3rem;
}

/* Jumbotron: white background, no bottom margin, padding from the variable. */
.jumbotron {
padding-top: var(--jumbotron-padding-y);
padding-bottom: var(--jumbotron-padding-y);
margin-bottom: 0;
background-color: #fff;
}
/* From 768px viewport width upwards the vertical padding is doubled. */
@media (min-width: 768px) {
.jumbotron {
padding-top: calc(var(--jumbotron-padding-y) * 2);
padding-bottom: calc(var(--jumbotron-padding-y) * 2);
}
}

/* Remove the bottom margin of the last paragraph inside the jumbotron. */
.jumbotron p:last-child {
margin-bottom: 0;
}

/* Light font weight for the jumbotron heading. */
.jumbotron-heading {
font-weight: 300;
}

/* Cap the width of the jumbotron's inner container. */
.jumbotron .container {
max-width: 40rem;
}

/* Footer spacing. */
footer {
padding-top: 3rem;
padding-bottom: 3rem;
}

footer p {
margin-bottom: .25rem;
}

/* Subtle drop shadow utility (used on the navbar and cards in home.html). */
.box-shadow { box-shadow: 0 .25rem .75rem rgba(0, 0, 0, .05); }

123 changes: 98 additions & 25 deletions News-Aggregator/news/templates/news/home.html
Original file line number Diff line number Diff line change
@@ -1,34 +1,107 @@
<!DOCTYPE html>
<html>
<html lang="en">
<head>
<title></title>
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<title>News Aggregator Home Page</title>

<!-- Bootstrap CSS -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
{% load static %}
<link rel="stylesheet" type="text/css" href="{% static 'news/static/main.css' %}">

</head>
<body style="background-color: black;">
<div class="jumbotron" style="background-color: rgb(83, 81, 81);">
<center><h1 style="color:black;font-weight: bold;font-size: 80px;">News Aggregator</h1>
<a href="{% url 'scrape' %}" class="btn btn-success" style="background-color: black;margin-top: 20px;width: 190px;height: 80px;font-weight: bold;font-size: 30px;padding: 10px;">Load news</a>
</form>
</center>
<body>
<header>
<!-- Navigation -->
<div class="collapse bg-dark" id="navbarHeader">
<div class="container">
<div class="row">
<div class="col-sm-8 col-md-7 py-4">
<h4 class="text-white">About</h4>
<p class="text-muted">
{% comment %} Our news aggregator works in 3 steps:<br>
1.It scrapes the news website for the articles.In this Django project, we are scraping a website '<a href="http://www.theonion.com" rel="nofollow">www.theonion.com</a>'<br>
(We have scraped news articles from 'latest' section of '<a href="http://www.theonion.com" rel="nofollow">www.theonion.com</a>' for demonstration)<br>
2.Then it stores the article’s images, links, and title.<br>
3.The stored objects in the database are served to the client. The client gets information in a nice template by clicking the 'Load news' button<br> {% endcomment %}
</p>
</div>
<div class="col-sm-4 offset-md-1 py-4">
<h4 class="text-white">Contact</h4>
<ul class="list-unstyled">
<li><a href="https://github.com/sakship31/News-Aggregator" class="text-white">Our Github Repository</a></li>
</ul>
</div>
</div>
</div>
<div class="card-columns" style="padding: 10px; margin-left: 150px;margin-right: 10px;">
{% for object in object_list %}
<div class="card" style="width: 18rem;border:5px white solid;margin-bottom: 50px;">
<img class="card-img-top" src = "{{ object.image }}">
<div class="card-body">
<h5 class="card-title"><div class="card-body">
<a href="{{object.url}}"><h5 class="card-title">{{object.title}}</h5></a>
</div></h5>
</div>
<div class="navbar navbar-dark bg-dark box-shadow">
<div class="container d-flex justify-content-between">
<a href="#" class="navbar-brand d-flex align-items-center">
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
<path fill="none" d="M0 0h24v24H0V0z"/>
<path d="M10 2C5.03 2 1 6.03 1 11s4.03 9 9 9 9-4.03 9-9-4.03-9-9-9zm0 16c-3.87 0-7-3.13-7-7s3.13-7 7-7 7 3.13 7 7-3.13 7-7 7zm5.59-7.59l-2.83-2.83c-.39-.39-1.02-.39-1.41 0l-.71.71c-.39.39-.39 1.02 0 1.41l2.83 2.83c.39.39 1.02.39 1.41 0l.71-.71c.39-.39.39-1.02 0-1.41z"/>
</svg>
<strong>News aggregator</strong>
</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarHeader" aria-controls="navbarHeader" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
</div>
</div>
{% endfor %}
</div>
</header>
<main role="main">
<section class="jumbotron text-center">
<div class="container">
<h1 class="jumbotron-heading">About this project</h1>
<p class="lead text-muted">News Aggregator is a Django project that scrapes a news website using Beautiful Soup and the requests module, combining a web crawler with a web application. Both parts are implemented in Python.</p>
<p>
<a href="{% url 'scrape' %}" class="btn btn-primary my-2">Load news by scraping</a>
<a href="{% url 'rss_scrape' %}" class="btn btn-secondary my-2">Load news from the RSS feed</a>
</p>
{% if new_feed %}
<div class="alert alert-success" role="alert">
Feeds updated successfully!
</div>
{%else%}
<div class="alert alert-primary" role="alert">
There are no new feeds; the site is up to date.
</div>
{% endif %}
</div>
</section>
<!-- Page Content -->
<div class="album py-5 bg-light">
<div class="container my-4">
<div class="row mb-12 text-center">
<div class="col-12">
<h1 class="font-weight-bold mb-4">Latest News</h1>
</div>
<!-- Latest News -->
{% for object in object_list %}
<div class="col-4">
<div class="card mb-4 box-shadow">
<img class="card-img-top" src="{{ object.image }}" alt="Card image cap">
<div class="card-body">
<h2 class="card-title"></h2>
<p class="card-text">{{object.title}}</p>
<a href="{{object.url}}" class="btn btn-primary">Read More &rarr;</a>
</div>
<div class="card-footer text-muted">
Posted on {{object.date}}
</div>
</div>
</div>
{% endfor %}
</div>
</div>
</main>
</div>
<script
src="http://code.jquery.com/jquery-3.3.1.min.js"
integrity="sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8="
crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.12.9/umd/popper.min.js" integrity="sha384-ApNbgh9B+Y1QKtv3Rn7W3mgPxhU9K/ScQsAP7hUibX39j7fakFPskvXusvfa0b4Q" crossorigin="anonymous"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/js/bootstrap.min.js" integrity="sha384-JZR6Spejh4U02d8jOt6vLEHfe/JQGiRRSQQxSfFWpi1MquVdAyjUar5+76PVCmYl" crossorigin="anonymous"></script>
<!-- Bootstrap JS -->
<script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha384-KJ3o2DKtIkvYIK3UENzmM7KCkRr/rE9/Qpg6aAZGJwFDMVNA/GpGFF93hXpG5KkN" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/popper.js@1.12.9/dist/umd/popper.min.js" integrity="sha384-ApNbgh9B+Y1QKtv3Rn7W3mgPxhU9K/ScQsAP7hUibX39j7fakFPskvXusvfa0b4Q" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/js/bootstrap.min.js" integrity="sha384-JZR6Spejh4U02d8jOt6vLEHfe/JQGiRRSQQxSfFWpi1MquVdAyjUar5+76PVCmYl" crossorigin="anonymous"></script>
</body>
</html>
3 changes: 2 additions & 1 deletion News-Aggregator/news/urls.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from django.urls import path
from news.views import scrape, news_list
from news.views import scrape, news_list,rss_scrape
# URL routes of the news app; each route name is what the template and
# redirects can refer to.
urlpatterns = [
# Triggers the HTML scraper view.
path('scrape/', scrape, name="scrape"),
# Triggers the RSS feed import view.
path('rss_scrape/', rss_scrape, name="rss_scrape"),
# Landing page listing the stored headlines.
path('', news_list, name="home"),
]
108 changes: 84 additions & 24 deletions News-Aggregator/news/views.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,99 @@
from django.shortcuts import render
import requests
from django.shortcuts import render, redirect
from bs4 import BeautifulSoup as BSoup
from news.models import Headline
import requests
import feedparser
from datetime import datetime

feed_url = 'https://www.theonion.com/rss'

# Scrape web feed

# Create your views here.

def scrape(request):
    """Scrape https://www.theguardian.com/world and store unseen headlines.

    Every ``div.fc-item__container`` element found on the page is handed to
    ``update_scrap_headline``. Whether anything new was saved is recorded in
    the session under ``new_feeds`` before redirecting one level up.

    The previous implementation wiped the whole ``Headline`` table and scraped
    theonion.com; that code path is removed because already-stored links are
    now skipped instead of being deleted and re-created.
    """
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
    # theonion.com uses randomly generated class names that keep changing,
    # whereas The Guardian has static class names that can be relied on for a
    # long time.
    url = "https://www.theguardian.com/world"
    # A timeout keeps the request from hanging forever when the remote site
    # does not answer.
    content = session.get(url, timeout=10).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('div', {"class": "fc-item__container"})
    is_there_any_new_feed = update_scrap_headline(News)
    request.session['new_feeds'] = is_there_any_new_feed
    return redirect("../")


def update_scrap_headline(News):
    """Save every scraped article whose link is not stored yet.

    Args:
        News: iterable of BeautifulSoup tags, one per article container.

    Returns:
        True when at least one new ``Headline`` was saved, otherwise False.
    """
    is_there_any_new_feed = False
    for article in News:
        title_tag = article.find('span', {'class': 'js-headline-text'})
        link_tag = article.find('a', {'class': 'fc-item__link'})
        # Title and link are mandatory. Skip containers lacking either one
        # instead of aborting the whole scrape on a None lookup.
        if title_tag is None or link_tag is None:
            continue
        link = link_tag.get('href')
        if not link or headline_is_exists(link):
            continue

        # The image is treated as optional: a missing container or <img>
        # results in an empty image instead of an exception.
        image_box = article.find('div', {'class': 'fc-item__image-container'})
        image_tag = image_box.find('img') if image_box is not None else None
        image_src = image_tag.get('src') if image_tag is not None else None

        new_headline = Headline()
        new_headline.title = title_tag.text
        new_headline.url = link
        new_headline.image = image_src
        # The scraped markup carries no publication date that is read here.
        new_headline.date = "undefined date"
        new_headline.save()
        is_there_any_new_feed = True

    return is_there_any_new_feed


# Read the RSS feed
def rss_scrape(request):
    """Import headlines from the RSS feed at ``feed_url`` and go back up.

    The boolean returned by ``update_rss_headlines`` is stored in the session
    under ``new_feeds`` before redirecting to the parent URL.
    """
    parsed_feed = feedparser.parse(feed_url)
    # Keep the outcome in the session so the page shown after the redirect
    # can read it.
    request.session['new_feeds'] = update_rss_headlines(parsed_feed)
    return redirect("../")


def news_list(request):
    """Render the home page with all stored headlines, newest first.

    The ``new_feeds`` session flag set by the scrape views is consumed here,
    so it is reported on one page view only.
    """
    # pop() reads and clears the flag in one step and leaves the session
    # untouched when the flag was never set.
    new_feed = request.session.pop("new_feeds", False)
    # Let the database return rows in descending primary-key order instead of
    # loading an unordered result and reversing it in Python.
    headlines = Headline.objects.order_by("-pk")
    context = {
        'object_list': headlines,
        'new_feed': new_feed,
    }
    return render(request, "news/home.html", context)


# update headlines to DB
def update_rss_headlines(feed):
    """Save every feed entry whose link is not stored yet.

    Args:
        feed: parsed feed object exposing an ``entries`` sequence; each entry
            must provide ``link`` and ``title``.

    Returns:
        True when at least one new ``Headline`` was saved, otherwise False.
    """
    is_there_any_new_feed = False

    for entry in feed.entries:
        if headline_is_exists(entry.link):
            continue

        new_headline = Headline()
        new_headline.title = entry.title
        new_headline.url = entry.link

        # The image is taken from the first <img> of the HTML summary; an
        # entry without summary or image is stored without one instead of
        # raising on a None lookup.
        soup = BSoup(getattr(entry, 'summary', ''), 'html.parser')
        img_tag = soup.find('img')
        new_headline.image = img_tag.get('src') if img_tag is not None else None

        # Entries without a parsed publication time get the same placeholder
        # string the HTML scraper stores.
        published = getattr(entry, 'published_parsed', None)
        if published:
            published_date = datetime(*published[:6])
            new_headline.date = published_date.strftime('%Y-%m-%d %H:%M:%S')
        else:
            new_headline.date = "undefined date"

        new_headline.save()
        is_there_any_new_feed = True
    return is_there_any_new_feed


# Check whether a headline with this URL already exists in our DB
def headline_is_exists(entrylink):
    """Return True when a ``Headline`` whose url equals *entrylink* is stored."""
    matching_headlines = Headline.objects.filter(url=entrylink)
    return matching_headlines.exists()