Skip to content

Commit b7b67ee

Browse files
authored
Merge pull request #1103 from AliasQli/sitemap
Divide sitemap into parts
2 parents 3b75f8c + b9330e0 commit b7b67ee

File tree

3 files changed

+115
-29
lines changed

3 files changed

+115
-29
lines changed

src/Distribution/Server/Features.hs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,7 @@ initHackageFeatures env@ServerEnv{serverVerbosity = verbosity} = do
339339
coreFeature
340340
documentationCoreFeature
341341
tagsFeature
342+
tarIndexCacheFeature
342343

343344
packageFeedFeature <- mkPackageFeedFeature
344345
coreFeature

src/Distribution/Server/Features/Sitemap.hs

Lines changed: 97 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
{-# LANGUAGE RecordWildCards, NamedFieldPuns, RecursiveDo #-}
2+
{-# LANGUAGE TupleSections #-}
23

34
module Distribution.Server.Features.Sitemap (
45
SitemapFeature(..)
@@ -25,7 +26,25 @@ import Data.ByteString.Lazy (ByteString)
2526
import Data.Time.Clock (UTCTime(..), getCurrentTime)
2627
import Data.Time.Calendar (showGregorian)
2728
import Network.URI
29+
import Control.DeepSeq
30+
import Text.Read
31+
import Data.List.Split
32+
import Distribution.Server.Framework.BlobStorage
33+
import Distribution.Server.Features.TarIndexCache
34+
import qualified Data.TarIndex as Tar
35+
import System.FilePath (takeExtension)
2836

37+
data Sitemap
38+
= Sitemap
39+
{ sitemapIndex :: XMLResponse
40+
, sitemaps :: [XMLResponse]
41+
}
42+
43+
instance NFData Sitemap where
44+
rnf (Sitemap i s) = rnf i `seq` rnf s
45+
46+
instance MemSize Sitemap where
47+
memSize (Sitemap i s) = memSize2 i s
2948

3049
data SitemapFeature = SitemapFeature {
3150
sitemapFeatureInterface :: HackageFeature
@@ -38,6 +57,7 @@ initSitemapFeature :: ServerEnv
3857
-> IO ( CoreFeature
3958
-> DocumentationFeature
4059
-> TagsFeature
60+
-> TarIndexCacheFeature
4161
-> IO SitemapFeature)
4262

4363
initSitemapFeature env@ServerEnv{ serverCacheDelay,
@@ -46,10 +66,11 @@ initSitemapFeature env@ServerEnv{ serverCacheDelay,
4666

4767
return $ \coref@CoreFeature{..}
4868
docsCore@DocumentationFeature{..}
49-
tagsf@TagsFeature{..} -> do
69+
tagsf@TagsFeature{..}
70+
tarf@TarIndexCacheFeature{..} -> do
5071

5172
rec let (feature, updateSitemapCache) =
52-
sitemapFeature env coref docsCore tagsf
73+
sitemapFeature env coref docsCore tagsf tarf
5374
initTime sitemapCache
5475

5576
sitemapCache <- newAsyncCacheNF updateSitemapCache
@@ -66,63 +87,85 @@ sitemapFeature :: ServerEnv
6687
-> CoreFeature
6788
-> DocumentationFeature
6889
-> TagsFeature
90+
-> TarIndexCacheFeature
6991
-> UTCTime
70-
-> AsyncCache XMLResponse
71-
-> (SitemapFeature, IO XMLResponse)
92+
-> AsyncCache Sitemap
93+
-> (SitemapFeature, IO Sitemap)
7294
sitemapFeature ServerEnv{..}
7395
CoreFeature{..}
7496
DocumentationFeature{..}
7597
TagsFeature{..}
98+
TarIndexCacheFeature{cachedTarIndex}
7699
initTime
77100
sitemapCache
78101
= (SitemapFeature{..}, updateSitemapCache)
79102
where
80103

81104
sitemapFeatureInterface = (emptyHackageFeature "sitemap") {
82-
featureResources = [ xmlSitemapResource ]
105+
featureResources = [ xmlSitemapIndexResource, xmlSitemapResource ]
83106
, featureState = []
84-
, featureDesc = "Provides a sitemap.xml for search engines"
107+
, featureDesc = "Provides sitemap for search engines"
85108
, featureCaches =
86109
[ CacheComponent {
87-
cacheDesc = "sitemap.xml",
110+
cacheDesc = "sitemap",
88111
getCacheMemSize = memSize <$> readAsyncCache sitemapCache
89112
}
90113
]
91114
, featurePostInit = do
92115
syncAsyncCache sitemapCache
93116
addCronJob serverCron CronJob {
94-
cronJobName = "regenerate the cached sitemap.xml",
117+
cronJobName = "regenerate the cached sitemap",
95118
cronJobFrequency = DailyJobFrequency,
96119
cronJobOneShot = False,
97120
cronJobAction = prodAsyncCache sitemapCache "cron"
98121
}
99122
}
100123

124+
xmlSitemapIndexResource :: Resource
125+
xmlSitemapIndexResource = (resourceAt "/sitemap_index.xml") {
126+
resourceDesc = [(GET, "The dynamically generated sitemap index, in XML format")]
127+
, resourceGet = [("xml", serveSitemapIndex)]
128+
}
129+
101130
xmlSitemapResource :: Resource
102-
xmlSitemapResource = (resourceAt "/sitemap.xml") {
131+
xmlSitemapResource = (resourceAt "/sitemap/:filename") {
103132
resourceDesc = [(GET, "The dynamically generated sitemap, in XML format")]
104133
, resourceGet = [("xml", serveSitemap)]
105134
}
106135

107-
serveSitemap :: DynamicPath -> ServerPartE Response
108-
serveSitemap _ = do
109-
sitemapXML <- liftIO $ readAsyncCache sitemapCache
136+
serveSitemapIndex :: DynamicPath -> ServerPartE Response
137+
serveSitemapIndex _ = do
138+
Sitemap{..} <- liftIO $ readAsyncCache sitemapCache
110139
cacheControlWithoutETag [Public, maxAgeDays 1]
111-
return (toResponse sitemapXML)
140+
return (toResponse sitemapIndex)
141+
142+
serveSitemap :: DynamicPath -> ServerPartE Response
143+
serveSitemap dpath =
144+
case lookup "filename" dpath of
145+
Just filename
146+
| [basename, "xml"] <- splitOn "." filename
147+
, Just i <- readMaybe basename -> do
148+
Sitemap{..} <- liftIO $ readAsyncCache sitemapCache
149+
guard (i < length sitemaps)
150+
cacheControlWithoutETag [Public, maxAgeDays 1]
151+
return (toResponse (sitemaps !! i))
152+
_ -> mzero
112153

113154
-- Generates a list of sitemap entries corresponding to hackage pages, then
114155
-- builds and returns an XML sitemap.
115-
updateSitemapCache :: IO XMLResponse
156+
updateSitemapCache :: IO Sitemap
116157
updateSitemapCache = do
117158

118159
alltags <- queryGetTagList
119160
pkgIndex <- queryGetPackageIndex
120161
docIndex <- queryDocumentationIndex
121162

122-
let sitemap = generateSitemap serverBaseURI pageBuildDate
163+
sitemaps <- generateSitemap serverBaseURI pageBuildDate
123164
(map fst alltags)
124-
pkgIndex docIndex
125-
return (XMLResponse sitemap)
165+
pkgIndex docIndex cachedTarIndex
166+
let uriScheme i = "/sitemap/" <> show i <> ".xml"
167+
sitemapIndex = renderSitemapIndex serverBaseURI (map uriScheme [0..(length sitemaps - 1)])
168+
return $ Sitemap (XMLResponse sitemapIndex) (map XMLResponse sitemaps)
126169

127170
pageBuildDate :: T.Text
128171
pageBuildDate = T.pack (showGregorian (utctDay initTime))
@@ -131,19 +174,21 @@ generateSitemap :: URI
131174
-> T.Text
132175
-> [Tag]
133176
-> PackageIndex.PackageIndex PkgInfo
134-
-> Map.Map PackageId a
135-
-> ByteString
136-
generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex =
137-
renderSitemap serverBaseURI allEntries
177+
-> Map.Map PackageId BlobId
178+
-> (BlobId -> IO Tar.TarIndex)
179+
-> IO [ByteString]
180+
generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex cachedTarIndex = do
181+
versionedDocSubEntries <- versionedDocSubEntriesIO
182+
let -- Combine and build sitemap
183+
allEntries = miscEntries
184+
++ tagEntries
185+
++ nameEntries
186+
++ nameVersEntries
187+
++ baseDocEntries
188+
++ versionedDocEntries
189+
++ versionedDocSubEntries
190+
pure $ renderSitemap serverBaseURI <$> chunksOf 50000 allEntries
138191
where
139-
-- Combine and build sitemap
140-
allEntries = miscEntries
141-
++ tagEntries
142-
++ nameEntries
143-
++ nameVersEntries
144-
++ baseDocEntries
145-
++ versionedDocEntries
146-
147192
-- Misc. pages
148193
-- e.g. ["http://myhackage.com/index", ...]
149194
miscEntries = urlsToSitemapEntries miscPages pageBuildDate Weekly 0.75
@@ -224,3 +269,26 @@ generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex =
224269
, Map.member (packageId pkg) docIndex
225270
]
226271
pageBuildDate Monthly 0.25
272+
273+
-- Versioned doc pages in subdirectories
274+
-- versionedSubDocURIs :: [path :: String]
275+
-- e.g. ["http://myhackage.com/packages/mypackage-1.0.2/docs/Lib.html", ...]
276+
versionedDocSubEntriesIO = do
277+
let pkgs = [ (pkg , blob)
278+
| pkg <- concat pkgss
279+
, Just blob <- [Map.lookup (packageId pkg) docIndex]
280+
]
281+
pkgIndices <- traverse (\(pkg, blob) -> (pkg,) <$> cachedTarIndex blob) pkgs
282+
pure $ urlsToSitemapEntries
283+
[ prefixPkgURI ++ display (packageId pkg) ++ "/docs" ++ fp
284+
| (pkg, tarIndex) <- pkgIndices
285+
, Just tar <- [Tar.lookup tarIndex ""]
286+
, fp <- entryToPaths "/" tar
287+
, takeExtension fp == ".html"
288+
]
289+
pageBuildDate Monthly 0.25
290+
291+
entryToPaths :: FilePath -> Tar.TarIndexEntry -> [FilePath]
292+
entryToPaths _ (Tar.TarFileEntry _) = []
293+
entryToPaths base (Tar.TarDir content) = map ((base </>) . fst) content ++
294+
[ file | (folder, entry) <- content, file <- entryToPaths (base </> folder) entry ]

src/Distribution/Server/Features/Sitemap/Functions.hs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
module Distribution.Server.Features.Sitemap.Functions (
2424
SitemapEntry
2525
, ChangeFreq(..)
26+
, renderSitemapIndex
2627
, renderSitemap
2728
, urlsToSitemapEntries
2829
, pathsAndDatesToSitemapEntries
@@ -47,6 +48,22 @@ data SitemapEntry = SitemapEntry {
4748

4849
data ChangeFreq = Monthly | Weekly | Daily
4950

51+
-- | Generate a sitemap index file from each sitemap uri.
52+
renderSitemapIndex :: URI -> [String] -> ByteString
53+
renderSitemapIndex serverBaseURI sitemaps =
54+
xrender $
55+
doc defaultDocInfo $
56+
xelem "sitemapindex" $
57+
xattr "xmlns" "http://www.sitemaps.org/schemas/sitemap/0.9"
58+
<#> map renderLink sitemaps
59+
where
60+
serverBaseURI' = T.pack (show serverBaseURI)
61+
renderLink :: String -> Xml Elem
62+
renderLink uri = xelem "sitemap" $
63+
xelems [
64+
xelem "loc" (xtext (serverBaseURI' <> T.pack (uri)))
65+
]
66+
5067
-- | Primary function - generates the XML file from a list of Nodes.
5168
renderSitemap :: URI -> [SitemapEntry] -> ByteString
5269
renderSitemap serverBaseURI entries =

0 commit comments

Comments
 (0)