# -*- coding: utf-8 -*-
"""Scrapy spider that scrapes Forge download data from files.minecraftforge.net.

Reconstructed from the patch "Add initial version of forge download scraper"
(commit 14a1e5fb2afe9c372646ce26a9b2b92c517067b3), which adds
minecraft-server/minecraftforge_spider.py.
"""
import re

import scrapy
from scrapy.selector import Selector

try:
    # Import locations used by Scrapy 1.0 and later.
    from scrapy.spiders import CrawlSpider, Rule  # noqa: F401
    from scrapy.linkextractors import LinkExtractor  # noqa: F401
except ImportError:
    # Older Scrapy releases only provide the scrapy.contrib.* paths.
    from scrapy.contrib.spiders import CrawlSpider, Rule  # noqa: F401
    from scrapy.contrib.linkextractors import LinkExtractor  # noqa: F401

# A promotion label "has a version" when it contains a hyphen with at least
# one character on each side of it.
_VERSIONED_LABEL = re.compile(r'(.+)\-.+')

# Captures whatever follows the numeric path segment of an adf.ly link.
_ADFLY_TARGET = r'http://adf.ly/\d+/(.+)'

# parse() reads cells[0] through cells[4] of every data row.
_REQUIRED_CELLS = 5


class Forge(scrapy.Item):
    """Top-level item returned by ``ForgeSpider.parse``."""

    versions = scrapy.Field()  # list of ForgeVersions items
    latest = scrapy.Field()  # a single ForgeLatest item


class ForgeVersions(scrapy.Item):
    """One promotions-table row whose label carries a version prefix."""

    id = scrapy.Field()  # text nodes of the second cell (list of strings)
    minecraft = scrapy.Field()  # text nodes of the third cell (list of strings)
    type = scrapy.Field()  # 'forge_' + lower-cased letters of the label
    time = scrapy.Field()  # text nodes of the fourth cell (list of strings)
    url = scrapy.Field()  # adf.ly capture(s) from the "Installer" link


class ForgeLatest(scrapy.Item):
    """Ids taken from the rows whose label has no version prefix."""

    forge_latest = scrapy.Field()
    forge_recommended = scrapy.Field()


class ForgeSpider(CrawlSpider):
    """Spider that reads the promotions table on files.minecraftforge.net."""

    # NOTE(review): Scrapy's documentation advises against overriding
    # ``parse`` on a CrawlSpider, because CrawlSpider uses ``parse`` to drive
    # its rules. No rules are defined here, so the override works, but a plain
    # ``scrapy.Spider`` base would be the safer choice -- confirm before
    # changing the class hierarchy.
    name = "ForgeSpider"
    allowed_domains = ["minecraftforge.net"]
    start_urls = ['http://files.minecraftforge.net']

    def parse(self, response):
        """Build a ``Forge`` item from the promotions table of ``response``.

        Rows whose first-cell label matches ``<something>-<something>`` become
        ``ForgeVersions`` entries in ``forge['versions']``. For every other
        row, the id goes to ``forge['latest']['forge_recommended']`` when the
        label starts with "Recommended", otherwise to
        ``forge['latest']['forge_latest']``.

        Returns:
            The populated ``Forge`` item. ``versions`` is an empty list when
            the table is missing or has no usable rows.
        """
        forge = Forge()
        forge['versions'] = []
        forge['latest'] = ForgeLatest()

        rows = Selector(response).xpath('//table[@id="promotions_table"]//tr')

        # The first row is treated as the table header and skipped. Slicing,
        # unlike pop(0), does not raise when the table was not found.
        for row in rows[1:]:
            cells = row.xpath('td')
            if len(cells) < _REQUIRED_CELLS:
                # Not a full data row; indexing it would raise IndexError.
                continue

            label = cells[0].xpath('text()')
            forge_id = cells[1].xpath('text()').extract()
            minecraft = cells[2].xpath('text()').extract()
            released = cells[3].xpath('text()')
            installer = cells[4].xpath('a[text()="Installer"]/@href')

            label_text = ''.join(label.extract())

            if _VERSIONED_LABEL.match(label_text):
                download = ForgeVersions()
                download['id'] = forge_id
                download['minecraft'] = minecraft
                download['type'] = (
                    'forge_' + ''.join(label.re('([a-zA-Z]+)')).lower()
                )
                download['time'] = released.extract()
                download['url'] = installer.re(_ADFLY_TARGET)
                forge['versions'].append(download)
            elif label_text.startswith('Recommended'):
                forge['latest']['forge_recommended'] = forge_id
            else:
                forge['latest']['forge_latest'] = forge_id

        return forge