|
10 | 10 |
|
11 | 11 | import org.jsoup.nodes.Document;
|
12 | 12 | import org.jsoup.nodes.Element;
|
| 13 | +import org.jsoup.select.Elements; |
13 | 14 |
|
14 | 15 | import com.rarchives.ripme.ripper.AbstractHTMLRipper;
|
15 | 16 | import com.rarchives.ripme.utils.Http;
|
16 | 17 |
|
17 | 18 | public class CfakeRipper extends AbstractHTMLRipper {
|
18 |
| - |
19 | 19 | public CfakeRipper(URL url) throws IOException {
|
20 |
| - super(url); |
| 20 | + super(url); |
| 21 | + } |
| 22 | + |
| 23 | + @Override |
| 24 | + public String getHost() { |
| 25 | + return "cfake"; |
21 | 26 | }
|
22 | 27 |
|
23 |
| - @Override |
24 |
| - public String getHost() { |
25 |
| - return "cfake"; |
| 28 | + @Override |
| 29 | + public String getDomain() { |
| 30 | + return "cfake.com"; |
| 31 | + } |
| 32 | + |
| 33 | + @Override |
| 34 | + public String getGID(URL url) throws MalformedURLException { |
| 35 | + Pattern p = Pattern.compile("https?://cfake\\.com/images/celebrity/([a-zA-Z1-9_-]*)/\\d+/?$"); |
| 36 | + Matcher m = p.matcher(url.toExternalForm()); |
| 37 | + if (m.matches()) { |
| 38 | + return m.group(1); |
26 | 39 | }
|
| 40 | + throw new MalformedURLException("Expected cfake URL format: " + |
| 41 | + "cfake.com/images/celebrity/MODEL/ID - got " + url + " instead"); |
| 42 | + } |
27 | 43 |
|
28 |
| - @Override |
29 |
| - public String getDomain() { |
30 |
| - return "cfake.com"; |
| 44 | + @Override |
| 45 | + public Document getFirstPage() throws IOException { |
| 46 | + // "url" is an instance field of the superclass |
| 47 | + return Http.url(url).get(); |
| 48 | + } |
| 49 | + |
| 50 | + @Override |
| 51 | + public Document getNextPage(Document doc) throws IOException { |
| 52 | + Element elem = doc.select("div#wrapper_path div#content_path div#num_page").last(); |
| 53 | + if (elem == null) { |
| 54 | + throw new IOException("No more pages (cannot find nav)"); |
31 | 55 | }
|
32 | 56 |
|
33 |
| - @Override |
34 |
| - public String getGID(URL url) throws MalformedURLException { |
35 |
| - Pattern p = Pattern.compile("https?://cfake\\.com/picture/([a-zA-Z1-9_-]*)/\\d+/?$"); |
36 |
| - Matcher m = p.matcher(url.toExternalForm()); |
37 |
| - if (m.matches()) { |
38 |
| - return m.group(1); |
39 |
| - } |
40 |
| - throw new MalformedURLException("Expected cfake URL format: " + |
41 |
| - "cfake.com/picture/MODEL/ID - got " + url + " instead"); |
| 57 | + Element nextAnchor = elem.select("a").first(); |
| 58 | + if (nextAnchor == null) { |
| 59 | + throw new IOException("No more pages (cannot find anchor)"); |
42 | 60 | }
|
43 | 61 |
|
44 |
| - @Override |
45 |
| - public Document getFirstPage() throws IOException { |
46 |
| - // "url" is an instance field of the superclass |
47 |
| - return Http.url(url).get(); |
| 62 | + Elements nextSpans = nextAnchor.select("span"); |
| 63 | + if (nextSpans.isEmpty()) { |
| 64 | + // This is the expected case that we're done iterating. |
| 65 | + throw new IOException("No more pages (last page)"); |
48 | 66 | }
|
49 | 67 |
|
50 |
| - @Override |
51 |
| - public Document getNextPage(Document doc) throws IOException { |
52 |
| - // We use comic-nav-next to the find the next page |
53 |
| - Element elem = doc.select("td > div.next > a").first(); |
54 |
| - if (elem == null) { |
55 |
| - throw new IOException("No more pages"); |
56 |
| - } |
57 |
| - String nextPage = elem.attr("href"); |
58 |
| - // Some times this returns a empty string |
59 |
| - // This for stops that |
60 |
| - if (nextPage.equals("")) { |
61 |
| - return null; |
62 |
| - } |
63 |
| - else { |
64 |
| - return Http.url("http://cfake.com" + nextPage).get(); |
65 |
| - } |
66 |
| - } |
67 |
| - |
68 |
| - @Override |
69 |
| - public List<String> getURLsFromPage(Document doc) { |
70 |
| - List<String> result = new ArrayList<>(); |
71 |
| - for (Element el : doc.select("table.display > tbody > tr > td > table > tbody > tr > td > a")) { |
72 |
| - if (el.attr("href").contains("upload")) { |
73 |
| - return result; |
74 |
| - } else { |
75 |
| - String imageSource = el.select("img").attr("src"); |
76 |
| - // We remove the .md from images so we download the full size image |
77 |
| - // not the thumbnail ones |
78 |
| - imageSource = imageSource.replace("thumbs", "photos"); |
79 |
| - result.add("http://cfake.com" + imageSource); |
80 |
| - } |
81 |
| - } |
82 |
| - return result; |
| 68 | + // Use the nextAnchor (parent of the span) for the URL |
| 69 | + String nextPage = nextAnchor.attr("href"); |
| 70 | + |
| 71 | + // Sometimes this returns an empty string; this stops that |
| 72 | + if (nextPage.equals("")) { |
| 73 | + return null; |
| 74 | + } else { |
| 75 | + return Http.url("https://cfake.com" + nextPage).get(); |
83 | 76 | }
|
| 77 | + } |
84 | 78 |
|
85 |
| - @Override |
86 |
| - public void downloadURL(URL url, int index) { |
87 |
| - addURLToDownload(url, getPrefix(index)); |
| 79 | + @Override |
| 80 | + public List<String> getURLsFromPage(Document doc) { |
| 81 | + List<String> result = new ArrayList<>(); |
| 82 | + for (Element el : doc.select("div#media_content .responsive .gallery > a img")) { |
| 83 | + // Convert found src value e.g. /medias/thumbs/2025/17358722979850276d_cfake.jpg |
| 84 | + // to photo src value e.g. |
| 85 | + // https://cfake.com/medias/photos/2025/17358722979850276d_cfake.jpg |
| 86 | + String imageSource = el.attr("src"); |
| 87 | + imageSource = imageSource.replace("thumbs", "photos"); |
| 88 | + result.add("https://cfake.com" + imageSource); |
88 | 89 | }
|
| 90 | + |
| 91 | + return result; |
| 92 | + } |
| 93 | + |
| 94 | + @Override |
| 95 | + public void downloadURL(URL url, int index) { |
| 96 | + addURLToDownload(url, getPrefix(index)); |
89 | 97 | }
|
| 98 | +} |
0 commit comments