sakship31 · Merkousha · Apr 10, 2023
diff --git a/News-Aggregator/NewsAggregator/__pycache__/__init__.cpython-38.pyc b/News-Aggregator/NewsAggregator/__pycache__/__init__.cpython-38.pyc
diff --git a/News-Aggregator/NewsAggregator/__pycache__/settings.cpython-38.pyc b/News-Aggregator/NewsAggregator/__pycache__/settings.cpython-38.pyc
diff --git a/News-Aggregator/NewsAggregator/__pycache__/urls.cpython-38.pyc b/News-Aggregator/NewsAggregator/__pycache__/urls.cpython-38.pyc
diff --git a/News-Aggregator/NewsAggregator/__pycache__/wsgi.cpython-38.pyc b/News-Aggregator/NewsAggregator/__pycache__/wsgi.cpython-38.pyc
diff --git a/News-Aggregator/NewsAggregator/settings.py b/News-Aggregator/NewsAggregator/settings.py
@@ -120,3 +120,4 @@
 # https://docs.djangoproject.com/en/3.0/howto/static-files/
 
 STATIC_URL = '/static/'
+STATICFILES_DIRS = [BASE_DIR +'/'+ 'static']
diff --git a/News-Aggregator/db.sqlite3 b/News-Aggregator/db.sqlite3
diff --git a/News-Aggregator/news/__pycache__/__init__.cpython-38.pyc b/News-Aggregator/news/__pycache__/__init__.cpython-38.pyc
diff --git a/News-Aggregator/news/__pycache__/admin.cpython-38.pyc b/News-Aggregator/news/__pycache__/admin.cpython-38.pyc
diff --git a/News-Aggregator/news/__pycache__/models.cpython-38.pyc b/News-Aggregator/news/__pycache__/models.cpython-38.pyc
diff --git a/News-Aggregator/news/__pycache__/urls.cpython-38.pyc b/News-Aggregator/news/__pycache__/urls.cpython-38.pyc
diff --git a/News-Aggregator/news/__pycache__/views.cpython-38.pyc b/News-Aggregator/news/__pycache__/views.cpython-38.pyc
diff --git a/News-Aggregator/news/migrations/0001_initial.py b/News-Aggregator/news/migrations/0001_initial.py
@@ -18,6 +18,7 @@ class Migration(migrations.Migration):
                 ('title', models.CharField(max_length=200)),
                 ('image', models.URLField(blank=True, null=True)),
                 ('url', models.TextField()),
+                ('date', models.TextField()),
             ],
         ),
     ]
diff --git a/News-Aggregator/news/migrations/__pycache__/0001_initial.cpython-38.pyc b/News-Aggregator/news/migrations/__pycache__/0001_initial.cpython-38.pyc
diff --git a/News-Aggregator/news/migrations/__pycache__/__init__.cpython-38.pyc b/News-Aggregator/news/migrations/__pycache__/__init__.cpython-38.pyc
diff --git a/News-Aggregator/news/models.py b/News-Aggregator/news/models.py
@@ -6,5 +6,6 @@ class Headline(models.Model):
   title = models.CharField(max_length=200)
   image = models.URLField(null=True, blank=True)
   url = models.TextField()
+  date = models.TextField()
   def __str__(self):
     return self.title
diff --git a/News-Aggregator/news/static/news/static/favicon.ico b/News-Aggregator/news/static/news/static/favicon.ico
diff --git a/News-Aggregator/news/static/news/static/main.css b/News-Aggregator/news/static/news/static/main.css
@@ -0,0 +1,40 @@
+:root {
+    --jumbotron-padding-y: 3rem;
+  }
+
+  .jumbotron {
+    padding-top: var(--jumbotron-padding-y);
+    padding-bottom: var(--jumbotron-padding-y);
+    margin-bottom: 0;
+    background-color: #fff;
+  }
+  @media (min-width: 768px) {
+    .jumbotron {
+      padding-top: calc(var(--jumbotron-padding-y) * 2);
+      padding-bottom: calc(var(--jumbotron-padding-y) * 2);
+    }
+  }
+
+  .jumbotron p:last-child {
+    margin-bottom: 0;
+  }
+
+  .jumbotron-heading {
+    font-weight: 300;
+  }
+
+  .jumbotron .container {
+    max-width: 40rem;
+  }
+
+  footer {
+    padding-top: 3rem;
+    padding-bottom: 3rem;
+  }
+
+  footer p {
+    margin-bottom: .25rem;
+  }
+
+  .box-shadow { box-shadow: 0 .25rem .75rem rgba(0, 0, 0, .05); }
+
diff --git a/News-Aggregator/news/templates/news/home.html b/News-Aggregator/news/templates/news/home.html
@@ -1,34 +1,107 @@
 <!DOCTYPE html>
-<html>
+<html lang="en">
 <head>
-    <title></title>
-    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <meta http-equiv="X-UA-Compatible" content="ie=edge">
+  <title>News Aggregator Home Page</title>
+
+  <!-- Bootstrap CSS -->
+  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
+  {% load static %}
+  <link rel="stylesheet" type="text/css" href="{% static 'news/static/main.css' %}">
+
 </head>
-<body style="background-color: black;">
-    <div class="jumbotron" style="background-color: rgb(83, 81, 81);">
-        <center><h1 style="color:black;font-weight: bold;font-size: 80px;">News Aggregator</h1>
-            <a href="{% url 'scrape' %}" class="btn btn-success" style="background-color: black;margin-top: 20px;width: 190px;height: 80px;font-weight: bold;font-size: 30px;padding: 10px;">Load news</a>
-        </form>
-    </center>
+<body>
+<header>
+  <!-- Navigation -->
+  <div class="collapse bg-dark" id="navbarHeader">
+    <div class="container">
+      <div class="row">
+        <div class="col-sm-8 col-md-7 py-4">
+          <h4 class="text-white">About</h4>
+          <p class="text-muted">
+            {% comment %} Our news aggregator works in 3 steps:<br>
+1.It scrapes the news website for the articles.In this Django project, we are scraping a website '<a href="http://www.theonion.com" rel="nofollow">www.theonion.com</a>'<br>
+(We have scraped news articles from 'latest' section of '<a href="http://www.theonion.com" rel="nofollow">www.theonion.com</a>' for demonstration)<br>
+2.Then it stores the article’s images, links, and title.<br>
+3.The stored objects in the database are served to the client. The client gets information in a nice template by clicking the 'Load news' button<br> {% endcomment %}
+          </p>
+        </div>
+        <div class="col-sm-4 offset-md-1 py-4">
+          <h4 class="text-white">Contact</h4>
+          <ul class="list-unstyled">
+            <li><a href="https://github.com/sakship31/News-Aggregator" class="text-white">Our Github Repository</a></li>
+          </ul>
+        </div>
+      </div>
     </div>
-  <div class="card-columns" style="padding: 10px; margin-left: 150px;margin-right: 10px;">
-    {% for object in object_list %}
-    <div class="card" style="width: 18rem;border:5px white solid;margin-bottom: 50px;">
-  <img class="card-img-top" src = "{{ object.image }}">
-  <div class="card-body">
-    <h5 class="card-title"><div class="card-body">
-      <a href="{{object.url}}"><h5 class="card-title">{{object.title}}</h5></a>
-    </div></h5>
+  </div>
+  <div class="navbar navbar-dark bg-dark box-shadow">
+    <div class="container d-flex justify-content-between">
+      <a href="#" class="navbar-brand d-flex align-items-center">
+        <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
+          <path fill="none" d="M0 0h24v24H0V0z"/>
+          <path d="M10 2C5.03 2 1 6.03 1 11s4.03 9 9 9 9-4.03 9-9-4.03-9-9-9zm0 16c-3.87 0-7-3.13-7-7s3.13-7 7-7 7 3.13 7 7-3.13 7-7 7zm5.59-7.59l-2.83-2.83c-.39-.39-1.02-.39-1.41 0l-.71.71c-.39.39-.39 1.02 0 1.41l2.83 2.83c.39.39 1.02.39 1.41 0l.71-.71c.39-.39.39-1.02 0-1.41z"/>
+        </svg>
+        <strong>News aggregator</strong>
+      </a>
+      <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarHeader" aria-controls="navbarHeader" aria-expanded="false" aria-label="Toggle navigation">
+        <span class="navbar-toggler-icon"></span>
+      </button>
     </div>
   </div>
-  {% endfor %}
-</div>
+</header>
+  <main role="main">
+  <section class="jumbotron text-center">
+    <div class="container">
+      <h1 class="jumbotron-heading">About this project</h1>
+      <p class="lead text-muted">News Aggregator is a Django project that scrapes a news website with Beautiful Soup and the requests module, combining a web crawler with a web application. Both technologies are implemented in Python.</p>
+      <p>
+        <a href="{% url 'scrape' %}" class="btn btn-primary my-2">Load news by scraping</a>
+        <a href="{% url 'rss_scrape' %}" class="btn btn-secondary my-2">Load news from RSS feed</a>
+      </p>
+      {% if new_feed %}
+      <div class="alert alert-success" role="alert">
+        Feeds updated successfully!
+      </div>
+      {%else%}
+      <div class="alert alert-primary" role="alert">
+        There are no new feeds. The site is already up to date.
+      </div>
+      {% endif %}
+    </div>
+  </section>
+  <!-- Page Content -->
+  <div class="album py-5 bg-light">  
+  <div class="container my-4">
+    <div class="row mb-12 text-center">
+      <div class="col-12">
+        <h1 class="font-weight-bold mb-4">Latest News</h1>
+      </div>
+        <!-- Latest News -->
+        {% for object in object_list %}
+        <div class="col-4">
+        <div class="card mb-4 box-shadow">
+          <img class="card-img-top" src="{{ object.image }}" alt="Card image cap">
+          <div class="card-body">
+            <h2 class="card-title"></h2>
+            <p class="card-text">{{object.title}}</p>
+            <a href="{{object.url}}" class="btn btn-primary">Read More &rarr;</a>
+          </div>
+          <div class="card-footer text-muted">
+            Posted on {{object.date}} 
+          </div>
+        </div>
+        </div>
+        {% endfor %}
+    </div>
+  </div>
+</main>
 </div>
-    <script
-src="http://code.jquery.com/jquery-3.3.1.min.js"
-integrity="sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8="
-    crossorigin="anonymous"></script>
-    <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.12.9/umd/popper.min.js" integrity="sha384-ApNbgh9B+Y1QKtv3Rn7W3mgPxhU9K/ScQsAP7hUibX39j7fakFPskvXusvfa0b4Q" crossorigin="anonymous"></script>
-    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/js/bootstrap.min.js" integrity="sha384-JZR6Spejh4U02d8jOt6vLEHfe/JQGiRRSQQxSfFWpi1MquVdAyjUar5+76PVCmYl" crossorigin="anonymous"></script>
+  <!-- Bootstrap JS -->
+<script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha384-KJ3o2DKtIkvYIK3UENzmM7KCkRr/rE9/Qpg6aAZGJwFDMVNA/GpGFF93hXpG5KkN" crossorigin="anonymous"></script>
+<script src="https://cdn.jsdelivr.net/npm/popper.js@1.12.9/dist/umd/popper.min.js" integrity="sha384-ApNbgh9B+Y1QKtv3Rn7W3mgPxhU9K/ScQsAP7hUibX39j7fakFPskvXusvfa0b4Q" crossorigin="anonymous"></script>
+<script src="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/js/bootstrap.min.js" integrity="sha384-JZR6Spejh4U02d8jOt6vLEHfe/JQGiRRSQQxSfFWpi1MquVdAyjUar5+76PVCmYl" crossorigin="anonymous"></script>
 </body>
 </html>
diff --git a/News-Aggregator/news/urls.py b/News-Aggregator/news/urls.py
@@ -1,6 +1,7 @@
 from django.urls import path
-from news.views import scrape, news_list
+from news.views import scrape, news_list,rss_scrape
 urlpatterns = [
   path('scrape/', scrape, name="scrape"),
+  path('rss_scrape/', rss_scrape, name="rss_scrape"),
   path('', news_list, name="home"),
 ]
diff --git a/News-Aggregator/news/views.py b/News-Aggregator/news/views.py
@@ -1,39 +1,99 @@
 from django.shortcuts import render
-import requests
 from django.shortcuts import render, redirect
 from bs4 import BeautifulSoup as BSoup
 from news.models import Headline
+import requests
+import feedparser
+from datetime import datetime
+
+feed_url = 'https://www.theonion.com/rss'
+
+# Scrape web feed
 
-# Create your views here.
 
 def scrape(request):
-  Headline.objects.all().delete()
-  session = requests.Session()
-  session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
-  url = "https://www.theonion.com/latest"
-  content = session.get(url).content
-  soup = BSoup(content, "html.parser")
-  News = soup.find_all('div', {"class":"cw4lnv-11 dFCKPx"})
-  for article in News:
-    main = article.find_all('a',href=True)
-    linkx = article.find('a', {"class":"sc-1out364-0 hMndXN js_link"})
-    link=linkx['href']
-    imgx=main[0].find('img',src=True)
-    image_src=imgx['data-srcset'].split(" ")[-4]
-    titlex = article.find('h2', {"class":"sc-759qgu-0 iRbzKE cw4lnv-6 pdtMb"})
-    title = titlex.text
-    new_headline = Headline()
-    #Headline.objects.all().delete()
-    new_headline.title = title
-    new_headline.url = link
-    new_headline.image = image_src
-    new_headline.save()
-  return redirect("../")
+
+    session = requests.Session()
+    session.headers = {
+        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
+    # the onion.com  is unstable on his class names , they're using random pregenerated class names that changes
+    # in other hand the guardian has static class names and we cand trust them for a long time
+    url = "https://www.theguardian.com/world"
+    content = session.get(url).content
+    soup = BSoup(content, "html.parser")
+    News = soup.find_all('div', {"class": "fc-item__container"})
+    is_there_any_new_feed = update_scrap_headline(News)
+    request.session['new_feeds'] = is_there_any_new_feed
+    return redirect("../")
+
+
+def update_scrap_headline(News):
+    print(len(News))
+    is_there_any_new_feed = False
+    for article in News:
+        # Extracting the image source
+        image_src = article.find(
+            'div', {'class': 'fc-item__image-container'}).find('img')['src']
+        # Extracting the title, link and body
+        title = article.find('span', {'class': 'js-headline-text'}).text
+        link = article.find('a', {'class': 'fc-item__link'})['href']
+        body = article.find(
+            'div', {'class': 'fc-item__standfirst'}).text.strip()
+        if not headline_is_exists(link):
+            is_there_any_new_feed = True
+            new_headline = Headline()
+            new_headline.title = title
+            new_headline.url = link
+            new_headline.image = image_src
+            new_headline.date = "undefined date"
+            new_headline.save()
+
+    return is_there_any_new_feed
+
+
+# Read the RSS feed
+def rss_scrape(request):
+    feed = feedparser.parse(feed_url)
+    is_there_any_new_feed = update_rss_headlines(feed)
+    # Set a value in the session
+    request.session['new_feeds'] = is_there_any_new_feed
+    return redirect("../")
 
 
 def news_list(request):
+    # Get the value from the session using the parameter
+    new_feed = request.session.get("new_feeds")
     headlines = Headline.objects.all()[::-1]
     context = {
         'object_list': headlines,
+        'new_feed': new_feed
     }
+    request.session['new_feeds'] = False
     return render(request, "news/home.html", context)
+
+
+# update headlines to DB
+def update_rss_headlines(feed):
+    is_there_any_new_feed = False
+    print(headline_is_exists("vlowblow"))
+    print("Feeds len = {}".format(len(feed.entries)))
+
+    for entry in feed.entries:
+        if not headline_is_exists(entry.link):
+            is_there_any_new_feed = True
+            new_headline = Headline()
+            new_headline.title = entry.title
+            new_headline.url = entry.link
+            soup = BSoup(entry.summary, 'html.parser')
+            img_src = soup.find('img')['src']
+            new_headline.image = img_src
+            published_date = datetime(*entry.published_parsed[:6])
+            published_date_str = published_date.strftime('%Y-%m-%d %H:%M:%S')
+            new_headline.date = published_date_str
+            new_headline.save()
+    return is_there_any_new_feed
+
+
+# check if headline not exist in our DB
+def headline_is_exists(entrylink):
+    return Headline.objects.filter(url=entrylink).exists()
Original file line number	Diff line number	Diff line change
Expand Up		@@ -120,3 +120,4 @@
		# https://docs.djangoproject.com/en/3.0/howto/static-files/

		STATIC_URL = '/static/'
		STATICFILES_DIRS = [BASE_DIR +'/'+ 'static']